From 0fc31a70b87e5af2b0a70bb2d38ec00c76c6b965 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 23 Jun 2023 14:19:35 -0700 Subject: [PATCH] Write string data directly to column_buffer in Parquet reader (#13302) The current Parquet reader decodes string data into a list of {ptr, length} tuples, which are then used in a gather step by `make_strings_column`. This gather step can be time consuming, especially when there are a large number of string columns. This PR addresses this by changing the decode step to write char and offset data directly to the `column_buffer`, which can then be used directly, bypassing the gather step. The image below compares the new approach to the old. The green arc at the top (82ms) is `gpuDecodePageData`, and the red arc (252ms) is the time spent in `make_strings_column`. The green arc below (25ms) is `gpuDecodePageData`, the amber arc (22ms) is a new kernel that computes string sizes for each page, and the magenta arc (106ms) is the kernel that decodes string columns. ![flat_edited](https://user-images.githubusercontent.com/25541553/236529570-f2d0d8d4-b2b5-4078-93ae-5123fa489c3c.png) NVbench shows a good speed up for strings as well. There is a jump in time for the INTEGRAL benchmark, but little to no change for other data types. The INTEGRAL time seems to be affected by extra time spent in `malloc` allocating host memory for a `hostdevice_vector`. This `malloc` always occurs, but for some reason in this branch it takes much longer to return. This is comparing to @nvdbaranec's branch for #13203. ``` | data_type | io | cardinality | run_length | Ref Time | Cmp Time | Diff | %Diff | |-------------|---------------|---------------|--------------|------------|------------|-------------|---------| | INTEGRAL | DEVICE_BUFFER | 0 | 1 | 14.288 ms | 14.729 ms | 440.423 us | 3.08% | | INTEGRAL | DEVICE_BUFFER | 1000 | 1 | 13.397 ms | 13.997 ms | 600.596 us | 4.48% | | INTEGRAL | DEVICE_BUFFER | 0 | 32 | 11.831 ms | 12.354 ms | 522.485 us | 4.42% | | INTEGRAL | DEVICE_BUFFER | 1000 | 32 | 11.335 ms | 11.854 ms | 518.791 us | 4.58% | | FLOAT | DEVICE_BUFFER | 0 | 1 | 8.681 ms | 8.715 ms | 34.846 us | 0.40% | | FLOAT | DEVICE_BUFFER | 1000 | 1 | 8.473 ms | 8.472 ms | -0.680 us | -0.01% | | FLOAT | DEVICE_BUFFER | 0 | 32 | 7.217 ms | 7.192 ms | -25.311 us | -0.35% | | FLOAT | DEVICE_BUFFER | 1000 | 32 | 7.425 ms | 7.422 ms | -3.162 us | -0.04% | | STRING | DEVICE_BUFFER | 0 | 1 | 50.079 ms | 42.566 ms |-7513.004 us | -15.00% | | STRING | DEVICE_BUFFER | 1000 | 1 | 16.813 ms | 14.989 ms |-1823.660 us | -10.85% | | STRING | DEVICE_BUFFER | 0 | 32 | 49.875 ms | 42.443 ms |-7432.718 us | -14.90% | | STRING | DEVICE_BUFFER | 1000 | 32 | 15.312 ms | 13.953 ms |-1358.910 us | -8.87% | | LIST | DEVICE_BUFFER | 0 | 1 | 80.303 ms | 80.688 ms | 385.916 us | 0.48% | | LIST | DEVICE_BUFFER | 1000 | 1 | 71.921 ms | 72.356 ms | 435.153 us | 0.61% | | LIST | DEVICE_BUFFER | 0 | 32 | 61.658 ms | 62.129 ms | 471.022 us | 0.76% | | LIST | DEVICE_BUFFER | 1000 | 32 | 63.086 ms | 63.371 ms | 285.608 us | 0.45% | | STRUCT | DEVICE_BUFFER | 0 | 1 | 66.272 ms | 61.142 ms |-5130.639 us | -7.74% | | STRUCT | DEVICE_BUFFER | 1000 | 1 | 40.217 ms | 39.328 ms | -888.781 us | -2.21% | | STRUCT | DEVICE_BUFFER | 0 | 32 | 63.660 ms | 58.837 ms |-4822.647 us | -7.58% | | STRUCT | DEVICE_BUFFER | 1000 | 32 | 38.080 ms | 37.104 ms | -976.133 us | -2.56% | ``` May address #13024 ~Depends on #13203~ Authors: - Ed Seidl (https://github.com/etseidl) - https://github.com/nvdbaranec - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Mike Wilson (https://github.com/hyperbolic2346) - https://github.com/nvdbaranec - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13302 --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/common/generate_input.cu | 19 + cpp/benchmarks/common/generate_input.hpp | 15 + .../io/parquet/parquet_reader_input.cpp | 41 + cpp/src/io/orc/reader_impl.cu | 6 +- cpp/src/io/parquet/page_data.cu | 1277 +--------------- cpp/src/io/parquet/page_decode.cuh | 1327 +++++++++++++++++ cpp/src/io/parquet/page_string_decode.cu | 865 +++++++++++ cpp/src/io/parquet/parquet_gpu.hpp | 63 +- cpp/src/io/parquet/reader_impl.cpp | 104 +- cpp/src/io/parquet/reader_impl.hpp | 14 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 14 +- cpp/src/io/parquet/reader_impl_helpers.hpp | 2 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 95 +- cpp/src/io/utilities/column_buffer.cpp | 181 ++- cpp/src/io/utilities/column_buffer.hpp | 182 ++- 16 files changed, 2803 insertions(+), 1403 deletions(-) create mode 100644 cpp/src/io/parquet/page_decode.cuh create mode 100644 cpp/src/io/parquet/page_string_decode.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d9f3824f706..0742d039092 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -392,6 +392,7 @@ add_library( src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu + src/io/parquet/page_string_decode.cu src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp src/io/parquet/reader_impl_helpers.cpp diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index c2901dc61ee..28710f2a745 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -785,6 +785,25 @@ std::vector cycle_dtypes(std::vector const& dtype_ return out_dtypes; } +/** + * @brief Repeat the given two data types with a given ratio of a:b. + * + * The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num' + * columns. + */ +std::vector mix_dtypes(std::pair const& dtype_ids, + cudf::size_type num_cols, + int first_num) +{ + std::vector out_dtypes; + out_dtypes.reserve(num_cols); + for (cudf::size_type col = 0; col < first_num; ++col) + out_dtypes.push_back(dtype_ids.first); + for (cudf::size_type col = first_num; col < num_cols; ++col) + out_dtypes.push_back(dtype_ids.second); + return out_dtypes; +} + std::unique_ptr create_random_table(std::vector const& dtype_ids, table_size_bytes table_bytes, data_profile const& profile, diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 8a5811218d0..a2efdb819bf 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -666,6 +666,21 @@ std::unique_ptr create_sequence_table( */ std::vector cycle_dtypes(std::vector const& dtype_ids, cudf::size_type num_cols); + +/** + * @brief Repeat the given two data types with a given ratio of a:b. + * + * The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num' + * columns. + * + * @param dtype_ids Pair of requested column types + * @param num_cols Total number of columns in the output vector + * @param first_num Total number of columns of type `dtype_ids.first` + * @return A vector of type_ids + */ +std::vector mix_dtypes(std::pair const& dtype_ids, + cudf::size_type num_cols, + int first_num); /** * @brief Create a random null mask object * diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index a8d40492890..9d1026f80fc 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -114,6 +114,38 @@ void BM_parquet_read_io_compression( parquet_read_common(write_opts, source_sink, state); } +template +void BM_parquet_read_io_small_mixed(nvbench::state& state, + nvbench::type_list>) +{ + auto const d_type = + std::pair{cudf::type_id::STRING, cudf::type_id::INT32}; + + cudf::size_type const cardinality = state.get_int64("cardinality"); + cudf::size_type const run_length = state.get_int64("run_length"); + cudf::size_type const num_strings = state.get_int64("num_string_cols"); + auto const source_type = IOType; + + // want 80 pages total, across 4 columns, so 20 pages per column + cudf::size_type constexpr n_col = 4; + cudf::size_type constexpr page_size_rows = 10'000; + cudf::size_type constexpr num_rows = page_size_rows * (80 / n_col); + + auto const tbl = + create_random_table(mix_dtypes(d_type, n_col, num_strings), + row_count{num_rows}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cuio_source_sink_pair source_sink(source_type); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .max_page_size_rows(10'000) + .compression(cudf::io::compression_type::NONE); + + parquet_read_common(write_opts, source_sink, state); +} + template void BM_parquet_read_chunks( nvbench::state& state, @@ -203,3 +235,12 @@ NVBENCH_BENCH_TYPES(BM_parquet_read_chunks, .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}) .add_int64_axis("byte_limit", {0, 500'000}); + +NVBENCH_BENCH_TYPES(BM_parquet_read_io_small_mixed, + NVBENCH_TYPE_AXES(nvbench::enum_type_list)) + .set_name("parquet_read_io_small_mixed") + .set_type_axes_names({"io"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}) + .add_int64_axis("num_string_cols", {1, 2, 3}); diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 429cf76e1f9..29d8916c961 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -525,13 +525,13 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks }; }); - out_buffers[col_idx]._null_mask = std::move(merged_null_mask); + out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); } else { // Since child column doesn't have a mask, copy parent null mask auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); - out_buffers[col_idx]._null_mask = - rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr); + out_buffers[col_idx].set_null_mask( + rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); } } } diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 5fa7d924f82..e49378485fc 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -14,31 +14,11 @@ * limitations under the License. */ -#include "parquet_gpu.hpp" -#include "rle_stream.cuh" -#include +#include "page_decode.cuh" + #include -#include -#include #include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace cudf { namespace io { @@ -47,548 +27,6 @@ namespace gpu { namespace { -constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 -constexpr int decode_block_size = 128; -constexpr int non_zero_buffer_size = decode_block_size * 2; -constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); } -template -constexpr int rolling_lvl_index(int index) -{ - return index % lvl_buf_size; -} - -struct page_state_s { - uint8_t const* data_start; - uint8_t const* data_end; - uint8_t const* lvl_end; - uint8_t const* dict_base; // ptr to dictionary page data - int32_t dict_size; // size of dictionary data - int32_t first_row; // First row in page to output - int32_t num_rows; // Rows in page to decode (including rows to be skipped) - int32_t first_output_value; // First value in page to output - int32_t num_input_values; // total # of input/level values in the page - int32_t dtype_len; // Output data type length - int32_t dtype_len_in; // Can be larger than dtype_len if truncating 32-bit into 8-bit - int32_t dict_bits; // # of bits to store dictionary indices - uint32_t dict_run; - int32_t dict_val; - uint32_t initial_rle_run[NUM_LEVEL_TYPES]; // [def,rep] - int32_t initial_rle_value[NUM_LEVEL_TYPES]; // [def,rep] - int32_t error; - PageInfo page; - ColumnChunkDesc col; - - // (leaf) value decoding - int32_t nz_count; // number of valid entries in nz_idx (write position in circular buffer) - int32_t dict_pos; // write position of dictionary indices - int32_t src_pos; // input read position of final output value - int32_t ts_scale; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale - - // repetition/definition level decoding - int32_t input_value_count; // how many values of the input we've processed - int32_t input_row_count; // how many rows of the input we've processed - int32_t input_leaf_count; // how many leaf values of the input we've processed - uint8_t const* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] - int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded - int32_t row_index_lower_bound; // lower bound of row indices we should process - - // a shared-memory cache of frequently used data when decoding. The source of this data is - // normally stored in global memory which can yield poor performance. So, when possible - // we copy that info here prior to decoding - PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]; - // points to either nesting_decode_cache above when possible, or to the global source otherwise - PageNestingDecodeInfo* nesting_info; -}; - -// buffers only used in the decode kernel. separated from page_state_s to keep -// shared memory usage in other kernels (eg, gpuComputePageSizes) down. -struct page_state_buffers_s { - uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions - uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values - uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings -}; - -/** - * @brief Returns whether or not a page spans either the beginning or the end of the - * specified row bounds - * - * @param s The page to be checked - * @param start_row The starting row index - * @param num_rows The number of rows - * - * @return True if the page spans the beginning or the end of the row bounds - */ -inline __device__ bool is_bounds_page(page_state_s* const s, size_t start_row, size_t num_rows) -{ - size_t const page_begin = s->col.start_row + s->page.chunk_row; - size_t const page_end = page_begin + s->page.num_rows; - size_t const begin = start_row; - size_t const end = start_row + num_rows; - - return ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end)); -} - -/** - * @brief Returns whether or not a page is completely contained within the specified - * row bounds - * - * @param s The page to be checked - * @param start_row The starting row index - * @param num_rows The number of rows - * - * @return True if the page is completely contained within the row bounds - */ -inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row, size_t num_rows) -{ - size_t const page_begin = s->col.start_row + s->page.chunk_row; - size_t const page_end = page_begin + s->page.num_rows; - size_t const begin = start_row; - size_t const end = start_row + num_rows; - - return page_begin >= begin && page_end <= end; -} - -/** - * @brief Parse the beginning of the level section (definition or repetition), - * initializes the initial RLE run & value, and returns the section length - * - * @param[in,out] s The page state - * @param[in] cur The current data position - * @param[in] end The end of the data - * @param[in] level_bits The bits required - * @param[in] is_decode_step True if we are performing the decode step. - * @param[in,out] decoders The repetition and definition level stream decoders - * - * @return The length of the section - */ -__device__ uint32_t InitLevelSection(page_state_s* s, - uint8_t const* cur, - uint8_t const* end, - level_type lvl) -{ - int32_t len; - int level_bits = s->col.level_bits[lvl]; - Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding - : s->page.repetition_level_encoding; - - auto start = cur; - if (level_bits == 0) { - len = 0; - s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value - s->initial_rle_value[lvl] = 0; - s->lvl_start[lvl] = cur; - s->abs_lvl_start[lvl] = cur; - } else if (encoding == Encoding::RLE) { - // V2 only uses RLE encoding, so only perform check here - if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) { - len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes; - } else if (cur + 4 < end) { - len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); - cur += 4; - } else { - len = 0; - s->error = 2; - } - s->abs_lvl_start[lvl] = cur; - if (!s->error) { - uint32_t run = get_vlq32(cur, end); - s->initial_rle_run[lvl] = run; - if (!(run & 1)) { - int v = (cur < end) ? cur[0] : 0; - cur++; - if (level_bits > 8) { - v |= ((cur < end) ? cur[0] : 0) << 8; - cur++; - } - s->initial_rle_value[lvl] = v; - } - s->lvl_start[lvl] = cur; - } - - if (cur > end) { s->error = 2; } - } else if (encoding == Encoding::BIT_PACKED) { - len = (s->page.num_input_values * level_bits + 7) >> 3; - s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1; // literal run - s->initial_rle_value[lvl] = 0; - s->lvl_start[lvl] = cur; - s->abs_lvl_start[lvl] = cur; - } else { - s->error = 3; - len = 0; - } - - s->abs_lvl_end[lvl] = start + len; - - return static_cast(len); -} - -/** - * @brief Decode values out of a definition or repetition stream - * - * @param[in,out] s Page state input/output - * @param[in] t target_count Target count of stream values on output - * @param[in] t Warp0 thread ID (0..31) - * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION - */ -template -__device__ void gpuDecodeStream( - level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) -{ - uint8_t const* cur_def = s->lvl_start[lvl]; - uint8_t const* end = s->lvl_end; - uint32_t level_run = s->initial_rle_run[lvl]; - int32_t level_val = s->initial_rle_value[lvl]; - int level_bits = s->col.level_bits[lvl]; - int32_t num_input_values = s->num_input_values; - int32_t value_count = s->lvl_count[lvl]; - int32_t batch_coded_count = 0; - - while (value_count < target_count && value_count < num_input_values) { - int batch_len; - if (level_run <= 1) { - // Get a new run symbol from the byte stream - int sym_len = 0; - if (!t) { - uint8_t const* cur = cur_def; - if (cur < end) { level_run = get_vlq32(cur, end); } - if (!(level_run & 1)) { - if (cur < end) level_val = cur[0]; - cur++; - if (level_bits > 8) { - if (cur < end) level_val |= cur[0] << 8; - cur++; - } - } - if (cur > end || level_run <= 1) { s->error = 0x10; } - sym_len = (int32_t)(cur - cur_def); - __threadfence_block(); - } - sym_len = shuffle(sym_len); - level_val = shuffle(level_val); - level_run = shuffle(level_run); - cur_def += sym_len; - } - if (s->error) { break; } - - batch_len = min(num_input_values - value_count, 32); - if (level_run & 1) { - // Literal run - int batch_len8; - batch_len = min(batch_len, (level_run >> 1) * 8); - batch_len8 = (batch_len + 7) >> 3; - if (t < batch_len) { - int bitpos = t * level_bits; - uint8_t const* cur = cur_def + (bitpos >> 3); - bitpos &= 7; - if (cur < end) level_val = cur[0]; - cur++; - if (level_bits > 8 - bitpos && cur < end) { - level_val |= cur[0] << 8; - cur++; - if (level_bits > 16 - bitpos && cur < end) level_val |= cur[0] << 16; - } - level_val = (level_val >> bitpos) & ((1 << level_bits) - 1); - } - level_run -= batch_len8 * 2; - cur_def += batch_len8 * level_bits; - } else { - // Repeated value - batch_len = min(batch_len, level_run >> 1); - level_run -= batch_len * 2; - } - if (t < batch_len) { - int idx = value_count + t; - output[rolling_index(idx)] = level_val; - } - batch_coded_count += batch_len; - value_count += batch_len; - } - - // update the stream info - if (!t) { - s->lvl_start[lvl] = cur_def; - s->initial_rle_run[lvl] = level_run; - s->initial_rle_value[lvl] = level_val; - s->lvl_count[lvl] = value_count; - } -} - -/** - * @brief Performs RLE decoding of dictionary indexes - * - * @param[in,out] s Page state input/output - * @param[out] sb Page state buffer output - * @param[in] target_pos Target index position in dict_idx buffer (may exceed this value by up to - * 31) - * @param[in] t Warp1 thread ID (0..31) - * - * @return A pair containing the new output position, and the total length of strings decoded (this - * will only be valid on thread 0 and if sizes_only is true). In the event that this function - * decodes strings beyond target_pos, the total length of strings returned will include these - * additional values. - */ -template -__device__ cuda::std::pair gpuDecodeDictionaryIndices( - volatile page_state_s* s, - [[maybe_unused]] volatile page_state_buffers_s* sb, - int target_pos, - int t) -{ - uint8_t const* end = s->data_end; - int dict_bits = s->dict_bits; - int pos = s->dict_pos; - int str_len = 0; - - while (pos < target_pos) { - int is_literal, batch_len; - if (!t) { - uint32_t run = s->dict_run; - uint8_t const* cur = s->data_start; - if (run <= 1) { - run = (cur < end) ? get_vlq32(cur, end) : 0; - if (!(run & 1)) { - // Repeated value - int bytecnt = (dict_bits + 7) >> 3; - if (cur + bytecnt <= end) { - int32_t run_val = cur[0]; - if (bytecnt > 1) { - run_val |= cur[1] << 8; - if (bytecnt > 2) { - run_val |= cur[2] << 16; - if (bytecnt > 3) { run_val |= cur[3] << 24; } - } - } - s->dict_val = run_val & ((1 << dict_bits) - 1); - } - cur += bytecnt; - } - } - if (run & 1) { - // Literal batch: must output a multiple of 8, except for the last batch - int batch_len_div8; - batch_len = max(min(32, (int)(run >> 1) * 8), 1); - batch_len_div8 = (batch_len + 7) >> 3; - run -= batch_len_div8 * 2; - cur += batch_len_div8 * dict_bits; - } else { - batch_len = max(min(32, (int)(run >> 1)), 1); - run -= batch_len * 2; - } - s->dict_run = run; - s->data_start = cur; - is_literal = run & 1; - __threadfence_block(); - } - __syncwarp(); - is_literal = shuffle(is_literal); - batch_len = shuffle(batch_len); - - // compute dictionary index. - int dict_idx = 0; - if (t < batch_len) { - dict_idx = s->dict_val; - if (is_literal) { - int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; - uint8_t const* p = s->data_start + (ofs >> 3); - ofs &= 7; - if (p < end) { - uint32_t c = 8 - ofs; - dict_idx = (*p++) >> ofs; - if (c < dict_bits && p < end) { - dict_idx |= (*p++) << c; - c += 8; - if (c < dict_bits && p < end) { - dict_idx |= (*p++) << c; - c += 8; - if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } - } - } - dict_idx &= (1 << dict_bits) - 1; - } - } - - // if we're not computing sizes, store off the dictionary index - if constexpr (!sizes_only) { sb->dict_idx[rolling_index(pos + t)] = dict_idx; } - } - - // if we're computing sizes, add the length(s) - if constexpr (sizes_only) { - int const len = [&]() { - if (t >= batch_len || (pos + t >= target_pos)) { return 0; } - uint32_t const dict_pos = (s->dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; - if (dict_pos < (uint32_t)s->dict_size) { - const auto* src = reinterpret_cast(s->dict_base + dict_pos); - return src->second; - } - return 0; - }(); - - using WarpReduce = cub::WarpReduce; - __shared__ typename WarpReduce::TempStorage temp_storage; - // note: str_len will only be valid on thread 0. - str_len += WarpReduce(temp_storage).Sum(len); - } - - pos += batch_len; - } - return {pos, str_len}; -} - -/** - * @brief Performs RLE decoding of dictionary indexes, for when dict_size=1 - * - * @param[in,out] s Page state input/output - * @param[out] sb Page state buffer output - * @param[in] target_pos Target write position - * @param[in] t Thread ID - * - * @return The new output position - */ -__device__ int gpuDecodeRleBooleans(volatile page_state_s* s, - volatile page_state_buffers_s* sb, - int target_pos, - int t) -{ - uint8_t const* end = s->data_end; - int pos = s->dict_pos; - - while (pos < target_pos) { - int is_literal, batch_len; - if (!t) { - uint32_t run = s->dict_run; - uint8_t const* cur = s->data_start; - if (run <= 1) { - run = (cur < end) ? get_vlq32(cur, end) : 0; - if (!(run & 1)) { - // Repeated value - s->dict_val = (cur < end) ? cur[0] & 1 : 0; - cur++; - } - } - if (run & 1) { - // Literal batch: must output a multiple of 8, except for the last batch - int batch_len_div8; - batch_len = max(min(32, (int)(run >> 1) * 8), 1); - if (batch_len >= 8) { batch_len &= ~7; } - batch_len_div8 = (batch_len + 7) >> 3; - run -= batch_len_div8 * 2; - cur += batch_len_div8; - } else { - batch_len = max(min(32, (int)(run >> 1)), 1); - run -= batch_len * 2; - } - s->dict_run = run; - s->data_start = cur; - is_literal = run & 1; - __threadfence_block(); - } - __syncwarp(); - is_literal = shuffle(is_literal); - batch_len = shuffle(batch_len); - if (t < batch_len) { - int dict_idx; - if (is_literal) { - int32_t ofs = t - ((batch_len + 7) & ~7); - uint8_t const* p = s->data_start + (ofs >> 3); - dict_idx = (p < end) ? (p[0] >> (ofs & 7u)) & 1 : 0; - } else { - dict_idx = s->dict_val; - } - sb->dict_idx[rolling_index(pos + t)] = dict_idx; - } - pos += batch_len; - } - return pos; -} - -/** - * @brief Parses the length and position of strings and returns total length of all strings - * processed - * - * @param[in,out] s Page state input/output - * @param[out] sb Page state buffer output - * @param[in] target_pos Target output position - * @param[in] t Thread ID - * - * @return Total length of strings processed - */ -template -__device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, - [[maybe_unused]] volatile page_state_buffers_s* sb, - int target_pos, - int t) -{ - int pos = s->dict_pos; - int total_len = 0; - - // This step is purely serial - if (!t) { - uint8_t const* cur = s->data_start; - int dict_size = s->dict_size; - int k = s->dict_val; - - while (pos < target_pos) { - int len; - if (k + 4 <= dict_size) { - len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); - k += 4; - if (k + len > dict_size) { len = 0; } - } else { - len = 0; - } - if constexpr (!sizes_only) { - sb->dict_idx[rolling_index(pos)] = k; - sb->str_len[rolling_index(pos)] = len; - } - k += len; - total_len += len; - pos++; - } - s->dict_val = k; - __threadfence_block(); - } - - return total_len; -} - -/** - * @brief Retrieves string information for a string at the specified source position - * - * @param[in] s Page state input - * @param[out] sb Page state buffer output - * @param[in] src_pos Source position - * - * @return A pair containing a pointer to the string and its length - */ -inline __device__ cuda::std::pair gpuGetStringData( - volatile page_state_s* s, volatile page_state_buffers_s* sb, int src_pos) -{ - char const* ptr = nullptr; - size_t len = 0; - - if (s->dict_base) { - // String dictionary - uint32_t dict_pos = - (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; - if (dict_pos < (uint32_t)s->dict_size) { - auto const* src = reinterpret_cast(s->dict_base + dict_pos); - ptr = src->first; - len = src->second; - } - } else { - // Plain encoding - uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; - if (dict_pos <= (uint32_t)s->dict_size) { - ptr = reinterpret_cast(s->data_start + dict_pos); - len = sb->str_len[rolling_index(src_pos)]; - } - } - - return {ptr, len}; -} - /** * @brief Output a string descriptor * @@ -964,669 +402,6 @@ static __device__ void gpuOutputGeneric( } /** - * @brief Sets up block-local page state information from the global pages. - * - * @param[in, out] s The local page state to be filled in - * @param[in] p The global page to be copied from - * @param[in] chunks The global list of chunks - * @param[in] min_row Crop all rows below min_row - * @param[in] num_rows Maximum number of rows to read - * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) - * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. - * Currently only used by gpuComputePageSizes step) - */ -static __device__ bool setupLocalPageInfo(page_state_s* const s, - PageInfo const* p, - device_span chunks, - size_t min_row, - size_t num_rows, - bool is_decode_step) -{ - int t = threadIdx.x; - int chunk_idx; - - // Fetch page info - if (!t) { - s->page = *p; - s->nesting_info = nullptr; - } - __syncthreads(); - - if (s->page.flags & PAGEINFO_FLAGS_DICTIONARY) { return false; } - // Fetch column chunk info - chunk_idx = s->page.chunk_idx; - if (!t) { s->col = chunks[chunk_idx]; } - - // if we can use the nesting decode cache, set it up now - auto const can_use_decode_cache = s->page.nesting_info_size <= max_cacheable_nesting_decode_info; - if (can_use_decode_cache) { - int depth = 0; - while (depth < s->page.nesting_info_size) { - int const thread_depth = depth + t; - if (thread_depth < s->page.nesting_info_size) { - // these values need to be copied over from global - s->nesting_decode_cache[thread_depth].max_def_level = - s->page.nesting_decode[thread_depth].max_def_level; - s->nesting_decode_cache[thread_depth].page_start_value = - s->page.nesting_decode[thread_depth].page_start_value; - s->nesting_decode_cache[thread_depth].start_depth = - s->page.nesting_decode[thread_depth].start_depth; - s->nesting_decode_cache[thread_depth].end_depth = - s->page.nesting_decode[thread_depth].end_depth; - } - depth += blockDim.x; - } - } - if (!t) { - s->nesting_info = can_use_decode_cache ? s->nesting_decode_cache : s->page.nesting_decode; - } - - __syncthreads(); - - // zero counts - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - int const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - s->nesting_info[thread_depth].valid_count = 0; - s->nesting_info[thread_depth].value_count = 0; - s->nesting_info[thread_depth].null_count = 0; - } - depth += blockDim.x; - } - __syncthreads(); - - if (!t) { - s->error = 0; - - // our starting row (absolute index) is - // col.start_row == absolute row index - // page.chunk-row == relative row index within the chunk - size_t page_start_row = s->col.start_row + s->page.chunk_row; - - // IMPORTANT : nested schemas can have 0 rows in a page but still have - // values. The case is: - // - On page N-1, the last row starts, with 2/6 values encoded - // - On page N, the remaining 4/6 values are encoded, but there are no new rows. - // if (s->page.num_input_values > 0 && s->page.num_rows > 0) { - if (s->page.num_input_values > 0) { - uint8_t* cur = s->page.page_data; - uint8_t* end = cur + s->page.uncompressed_page_size; - - uint32_t dtype_len_out = s->col.data_type >> 3; - s->ts_scale = 0; - // Validate data type - auto const data_type = s->col.data_type & 7; - switch (data_type) { - case BOOLEAN: - s->dtype_len = 1; // Boolean are stored as 1 byte on the output - break; - case INT32: [[fallthrough]]; - case FLOAT: s->dtype_len = 4; break; - case INT64: - if (s->col.ts_clock_rate) { - int32_t units = 0; - // Duration types are not included because no scaling is done when reading - if (s->col.converted_type == TIMESTAMP_MILLIS) { - units = cudf::timestamp_ms::period::den; - } else if (s->col.converted_type == TIMESTAMP_MICROS) { - units = cudf::timestamp_us::period::den; - } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { - units = cudf::timestamp_ns::period::den; - } - if (units and units != s->col.ts_clock_rate) { - s->ts_scale = (s->col.ts_clock_rate < units) ? -(units / s->col.ts_clock_rate) - : (s->col.ts_clock_rate / units); - } - } - [[fallthrough]]; - case DOUBLE: s->dtype_len = 8; break; - case INT96: s->dtype_len = 12; break; - case BYTE_ARRAY: - if (s->col.converted_type == DECIMAL) { - auto const decimal_precision = s->col.decimal_precision; - s->dtype_len = [decimal_precision]() { - if (decimal_precision <= MAX_DECIMAL32_PRECISION) { - return sizeof(int32_t); - } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { - return sizeof(int64_t); - } else { - return sizeof(__int128_t); - } - }(); - } else { - s->dtype_len = sizeof(string_index_pair); - } - break; - default: // FIXED_LEN_BYTE_ARRAY: - s->dtype_len = dtype_len_out; - s->error |= (s->dtype_len <= 0); - break; - } - // Special check for downconversions - s->dtype_len_in = s->dtype_len; - if (s->col.converted_type == DECIMAL && data_type == FIXED_LEN_BYTE_ARRAY) { - s->dtype_len = [dtype_len = s->dtype_len]() { - if (dtype_len <= sizeof(int32_t)) { - return sizeof(int32_t); - } else if (dtype_len <= sizeof(int64_t)) { - return sizeof(int64_t); - } else { - return sizeof(__int128_t); - } - }(); - } else if (data_type == INT32) { - if (dtype_len_out == 1) { - // INT8 output - s->dtype_len = 1; - } else if (dtype_len_out == 2) { - // INT16 output - s->dtype_len = 2; - } else if (s->col.converted_type == TIME_MILLIS) { - // INT64 output - s->dtype_len = 8; - } - } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { - s->dtype_len = 4; // HASH32 output - } else if (data_type == INT96) { - s->dtype_len = 8; // Convert to 64-bit timestamp - } - - // NOTE: s->page.num_rows, s->col.chunk_row, s->first_row and s->num_rows will be - // invalid/bogus during first pass of the preprocess step for nested types. this is ok - // because we ignore these values in that stage. - { - auto const max_row = min_row + num_rows; - - // if we are totally outside the range of the input, do nothing - if ((page_start_row > max_row) || (page_start_row + s->page.num_rows < min_row)) { - s->first_row = 0; - s->num_rows = 0; - } - // otherwise - else { - s->first_row = page_start_row >= min_row ? 0 : min_row - page_start_row; - auto const max_page_rows = s->page.num_rows - s->first_row; - s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row - ? max_page_rows - : max_row - (page_start_row + s->first_row); - } - } - - // during the decoding step we need to offset the global output buffers - // for each level of nesting so that we write to the section this page - // is responsible for. - // - for flat schemas, we can do this directly by using row counts - // - for nested schemas, these offsets are computed during the preprocess step - // - // NOTE: in a chunked read situation, s->col.column_data_base and s->col.valid_map_base - // will be aliased to memory that has been freed when we get here in the non-decode step, so - // we cannot check against nullptr. we'll just check a flag directly. - if (is_decode_step) { - int max_depth = s->col.max_nesting_depth; - for (int idx = 0; idx < max_depth; idx++) { - PageNestingDecodeInfo* nesting_info = &s->nesting_info[idx]; - - size_t output_offset; - // schemas without lists - if (s->col.max_level[level_type::REPETITION] == 0) { - output_offset = page_start_row >= min_row ? page_start_row - min_row : 0; - } - // for schemas with lists, we've already got the exact value precomputed - else { - output_offset = nesting_info->page_start_value; - } - - nesting_info->data_out = static_cast(s->col.column_data_base[idx]); - - if (nesting_info->data_out != nullptr) { - // anything below max depth with a valid data pointer must be a list, so the - // element size is the size of the offset type. - uint32_t len = idx < max_depth - 1 ? sizeof(cudf::size_type) : s->dtype_len; - nesting_info->data_out += (output_offset * len); - } - nesting_info->valid_map = s->col.valid_map_base[idx]; - if (nesting_info->valid_map != nullptr) { - nesting_info->valid_map += output_offset >> 5; - nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); - } - } - } - s->first_output_value = 0; - - // Find the compressed size of repetition levels - cur += InitLevelSection(s, cur, end, level_type::REPETITION); - // Find the compressed size of definition levels - cur += InitLevelSection(s, cur, end, level_type::DEFINITION); - - s->dict_bits = 0; - s->dict_base = nullptr; - s->dict_size = 0; - // NOTE: if additional encodings are supported in the future, modifications must - // be made to is_supported_encoding() in reader_impl_preprocess.cu - switch (s->page.encoding) { - case Encoding::PLAIN_DICTIONARY: - case Encoding::RLE_DICTIONARY: - // RLE-packed dictionary indices, first byte indicates index length in bits - if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { - // String dictionary: use index - s->dict_base = reinterpret_cast(s->col.str_dict_index); - s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); - } else { - s->dict_base = - s->col.page_info[0].page_data; // dictionary is always stored in the first page - s->dict_size = s->col.page_info[0].uncompressed_page_size; - } - s->dict_run = 0; - s->dict_val = 0; - s->dict_bits = (cur < end) ? *cur++ : 0; - if (s->dict_bits > 32 || !s->dict_base) { s->error = (10 << 8) | s->dict_bits; } - break; - case Encoding::PLAIN: - s->dict_size = static_cast(end - cur); - s->dict_val = 0; - if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } - break; - case Encoding::RLE: s->dict_run = 0; break; - default: - s->error = 1; // Unsupported encoding - break; - } - if (cur > end) { s->error = 1; } - s->lvl_end = cur; - s->data_start = cur; - s->data_end = end; - } else { - s->error = 1; - } - - s->lvl_count[level_type::REPETITION] = 0; - s->lvl_count[level_type::DEFINITION] = 0; - s->nz_count = 0; - s->num_input_values = s->page.num_input_values; - s->dict_pos = 0; - s->src_pos = 0; - - // for flat hierarchies, we can't know how many leaf values to skip unless we do a full - // preprocess of the definition levels (since nulls will have no actual decodable value, there - // is no direct correlation between # of rows and # of decodable values). so we will start - // processing at the beginning of the value stream and disregard any indices that start - // before the first row. - if (s->col.max_level[level_type::REPETITION] == 0) { - s->page.skipped_values = 0; - s->page.skipped_leaf_values = 0; - s->input_value_count = 0; - s->input_row_count = 0; - s->input_leaf_count = 0; - - s->row_index_lower_bound = -1; - } - // for nested hierarchies, we have run a preprocess that lets us skip directly to the values - // we need to start decoding at - else { - // input_row_count translates to "how many rows we have processed so far", so since we are - // skipping directly to where we want to start decoding, set it to first_row - s->input_row_count = s->first_row; - - // return the lower bound to compare (page-relative) thread row index against. Explanation: - // In the case of nested schemas, rows can span page boundaries. That is to say, - // we can encounter the first value for row X on page M, but the last value for page M - // might not be the last value for row X. page M+1 (or further) may contain the last value. - // - // This means that the first values we encounter for a given page (M+1) may not belong to the - // row indicated by chunk_row, but to the row before it that spanned page boundaries. If that - // previous row is within the overall row bounds, include the values by allowing relative row - // index -1 - int const max_row = (min_row + num_rows) - 1; - if (min_row < page_start_row && max_row >= page_start_row - 1) { - s->row_index_lower_bound = -1; - } else { - s->row_index_lower_bound = s->first_row; - } - - // if we're in the decoding step, jump directly to the first - // value we care about - if (is_decode_step) { - s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0; - } else { - s->input_value_count = 0; - s->input_leaf_count = 0; - s->page.skipped_values = - -1; // magic number to indicate it hasn't been set for use inside UpdatePageSizes - s->page.skipped_leaf_values = 0; - } - } - - __threadfence_block(); - } - __syncthreads(); - - return true; -} - -/** - * @brief Store a validity mask containing value_count bits into the output validity buffer of the - * page. - * - * @param[in,out] nesting_info The page/nesting information to store the mask in. The validity map - * offset is also updated - * @param[in] valid_mask The validity mask to be stored - * @param[in] value_count # of bits in the validity mask - */ -static __device__ void store_validity(PageNestingDecodeInfo* nesting_info, - uint32_t valid_mask, - int32_t value_count) -{ - int word_offset = nesting_info->valid_map_offset / 32; - int bit_offset = nesting_info->valid_map_offset % 32; - // if we fit entirely in the output word - if (bit_offset + value_count <= 32) { - auto relevant_mask = static_cast((static_cast(1) << value_count) - 1); - - if (relevant_mask == ~0) { - nesting_info->valid_map[word_offset] = valid_mask; - } else { - atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(nesting_info->valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); - } - } - // we're going to spill over into the next word. - // note : writing both values here is the lazy/slow way. we could be writing just - // the first word and rolling the remaining bits over into the next call. - // however, some basic performance tests shows almost no difference between these two - // methods. More detailed performance testing might be worthwhile here. - else { - uint32_t bits_left = 32 - bit_offset; - - // first word. strip bits_left bits off the beginning and store that - uint32_t relevant_mask = ((1 << bits_left) - 1); - uint32_t mask_word0 = valid_mask & relevant_mask; - atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(nesting_info->valid_map + word_offset, mask_word0 << bit_offset); - - // second word. strip the remainder of the bits off the end and store that - relevant_mask = ((1 << (value_count - bits_left)) - 1); - uint32_t mask_word1 = valid_mask & (relevant_mask << bits_left); - atomicAnd(nesting_info->valid_map + word_offset + 1, ~(relevant_mask)); - atomicOr(nesting_info->valid_map + word_offset + 1, mask_word1 >> bits_left); - } - - nesting_info->valid_map_offset += value_count; -} - -/** - * @brief Compute the nesting bounds within the hierarchy to add values to, and the definition level - * D to which we should considered them null or not. - * - * @param[out] start_depth The start nesting depth - * @param[out] end_depth The end nesting depth (inclusive) - * @param[out] d The definition level up to which added values are not-null. if t is out of bounds, - * d will be -1 - * @param[in] s Local page information - * @param[in] rep Repetition level buffer - * @param[in] def Definition level buffer - * @param[in] input_value_count The current count of input level values we have processed - * @param[in] target_input_value_count The desired # of input level values we want to process - * @param[in] t Thread index - */ -template -inline __device__ void get_nesting_bounds(int& start_depth, - int& end_depth, - int& d, - page_state_s* s, - level_t const* const rep, - level_t const* const def, - int input_value_count, - int32_t target_input_value_count, - int t) -{ - start_depth = -1; - end_depth = -1; - d = -1; - if (input_value_count + t < target_input_value_count) { - int const index = rolling_lvl_index(input_value_count + t); - d = static_cast(def[index]); - // if we have repetition (there are list columns involved) we have to - // bound what nesting levels we apply values to - if (s->col.max_level[level_type::REPETITION] > 0) { - level_t const r = rep[index]; - start_depth = s->nesting_info[r].start_depth; - end_depth = s->nesting_info[d].end_depth; - } - // for columns without repetition (even ones involving structs) we always - // traverse the entire hierarchy. - else { - start_depth = 0; - end_depth = s->col.max_nesting_depth - 1; - } - } -} - -/** - * @brief Process a batch of incoming repetition/definition level values and generate - * validity, nested column offsets (where appropriate) and decoding indices. - * - * @param[in] target_input_value_count The # of repetition/definition levels to process up to - * @param[in] s Local page information - * @param[out] sb Page state buffer output - * @param[in] rep Repetition level buffer - * @param[in] def Definition level buffer - * @param[in] t Thread index - */ -template -static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, - page_state_s* s, - page_state_buffers_s* sb, - level_t const* const rep, - level_t const* const def, - int t) -{ - // max nesting depth of the column - int const max_depth = s->col.max_nesting_depth; - bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - // how many (input) values we've processed in the page so far - int input_value_count = s->input_value_count; - // how many rows we've processed in the page so far - int input_row_count = s->input_row_count; - - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - - // process until we've reached the target - while (input_value_count < target_input_value_count) { - // determine the nesting bounds for this thread (the range of nesting depths we - // will generate new value indices and validity bits for) - int start_depth, end_depth, d; - get_nesting_bounds( - start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); - - // 4 interesting things to track: - // thread_value_count : # of output values from the view of this thread - // warp_value_count : # of output values for the whole warp - // - // thread_valid_count : # of valid values from the view of this thread - // warp_valid_count : # of valid values for the whole warp - uint32_t thread_value_count, warp_value_count; - uint32_t thread_valid_count, warp_valid_count; - - // track (page-relative) row index for the thread so we can compare against input bounds - // keep track of overall # of rows we've read. - int const is_new_row = start_depth == 0 ? 1 : 0; - uint32_t const warp_row_count_mask = ballot(is_new_row); - int32_t const thread_row_index = - input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); - input_row_count += __popc(warp_row_count_mask); - // is this thread within read row bounds? - int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && - thread_row_index < (s->first_row + s->num_rows) - ? 1 - : 0; - - // compute warp and thread value counts - uint32_t const warp_count_mask = - ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); - - warp_value_count = __popc(warp_count_mask); - // Note : ((1 << t) - 1) implies "for all threads before me" - thread_value_count = __popc(warp_count_mask & ((1 << t) - 1)); - - // walk from 0 to max_depth - uint32_t next_thread_value_count, next_warp_value_count; - for (int s_idx = 0; s_idx < max_depth; s_idx++) { - PageNestingDecodeInfo* nesting_info = &nesting_info_base[s_idx]; - - // if we are within the range of nesting levels we should be adding value indices for - int const in_nesting_bounds = - ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; - - // everything up to the max_def_level is a non-null value - uint32_t const is_valid = d >= nesting_info->max_def_level && in_nesting_bounds ? 1 : 0; - - // compute warp and thread valid counts - uint32_t const warp_valid_mask = - // for flat schemas, a simple ballot_sync gives us the correct count and bit positions - // because every value in the input matches to a value in the output - !has_repetition - ? ballot(is_valid) - : - // for nested schemas, it's more complicated. This warp will visit 32 incoming values, - // however not all of them will necessarily represent a value at this nesting level. so - // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is cur_value_count. for cuda 11 we could use - // __reduce_or_sync(), but until then we have to do a warp reduce. - WarpReduceOr32(is_valid << thread_value_count); - - thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1)); - warp_valid_count = __popc(warp_valid_mask); - - // if this is the value column emit an index for value decoding - if (is_valid && s_idx == max_depth - 1) { - int const src_pos = nesting_info->valid_count + thread_valid_count; - int const dst_pos = nesting_info->value_count + thread_value_count; - // nz_idx is a mapping of src buffer indices to destination buffer indices - sb->nz_idx[rolling_index(src_pos)] = dst_pos; - } - - // compute warp and thread value counts for the -next- nesting level. we need to - // do this for nested schemas so that we can emit an offset for the -current- nesting - // level. more concretely : the offset for the current nesting level == current length of the - // next nesting level - if (s_idx < max_depth - 1) { - uint32_t const next_warp_count_mask = - ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0); - next_warp_value_count = __popc(next_warp_count_mask); - next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1)); - - // if we're -not- at a leaf column and we're within nesting/row bounds - // and we have a valid data_out pointer, it implies this is a list column, so - // emit an offset. - if (in_nesting_bounds && nesting_info->data_out != nullptr) { - int const idx = nesting_info->value_count + thread_value_count; - cudf::size_type const ofs = nesting_info_base[s_idx + 1].value_count + - next_thread_value_count + - nesting_info_base[s_idx + 1].page_start_value; - (reinterpret_cast(nesting_info->data_out))[idx] = ofs; - } - } - - // nested schemas always read and write to the same bounds (that is, read and write positions - // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the - // first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector here - // we need to adjust our computed mask to take into account the write row bounds. - int const in_write_row_bounds = - !has_repetition - ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) - : in_row_bounds; - int const first_thread_in_write_range = - !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; - - // # of bits to of the validity mask to write out - int const warp_valid_mask_bit_count = - first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; - - // increment count of valid values, count of total values, and update validity mask - if (!t) { - if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { - uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; - store_validity(nesting_info, warp_output_valid_mask, warp_valid_mask_bit_count); - - nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); - } - nesting_info->valid_count += warp_valid_count; - nesting_info->value_count += warp_value_count; - } - - // propagate value counts for the next level - warp_value_count = next_warp_value_count; - thread_value_count = next_thread_value_count; - } - - input_value_count += min(32, (target_input_value_count - input_value_count)); - __syncwarp(); - } - - // update - if (!t) { - // update valid value count for decoding and total # of values we've processed - s->nz_count = nesting_info_base[max_depth - 1].valid_count; - s->input_value_count = input_value_count; - s->input_row_count = input_row_count; - } -} - -/** - * @brief Process repetition and definition levels up to the target count of leaf values. - * - * In order to decode actual leaf values from the input stream, we need to generate the - * list of non-null value positions (page_state_s::nz_idx). We do this by processing - * the repetition and definition level streams. This process also generates validity information, - * and offset column values in the case of nested schemas. Because of the way the streams - * are encoded, this function may generate slightly more than target_leaf_count. - * - * Only runs on 1 warp. - * - * @param[in] s The local page state - * @param[out] sb Page state buffer output - * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for - * @param[in] rep Repetition level buffer - * @param[in] def Definition level buffer - * @param[in] t Thread index - */ -template -__device__ void gpuDecodeLevels(page_state_s* s, - page_state_buffers_s* sb, - int32_t target_leaf_count, - level_t* const rep, - level_t* const def, - int t) -{ - bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; - - constexpr int batch_size = 32; - int cur_leaf_count = target_leaf_count; - while (!s->error && s->nz_count < target_leaf_count && - s->input_value_count < s->num_input_values) { - if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); } - gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION); - __syncwarp(); - - // because the rep and def streams are encoded separately, we cannot request an exact - // # of values to be decoded at once. we can only process the lowest # of decoded rep/def - // levels we get. - int actual_leaf_count = has_repetition ? min(s->lvl_count[level_type::REPETITION], - s->lvl_count[level_type::DEFINITION]) - : s->lvl_count[level_type::DEFINITION]; - - // process what we got back - gpuUpdateValidityOffsetsAndRowIndices( - actual_leaf_count, s, sb, rep, def, t); - cur_leaf_count = actual_leaf_count + batch_size; - __syncwarp(); - } -} - -/** - * @brief Returns the total size in bytes of string char data in the page. * * This function expects the dictionary position to be at 0 and will traverse * the entire thing. @@ -1817,7 +592,7 @@ __global__ void __launch_bounds__(preprocess_block_size) rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; // setup page info - if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false)) { return; } + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, all_types_filter{}, false)) { return; } // initialize the stream decoders (requires values computed in setupLocalPageInfo) int const max_batch_size = lvl_buf_size; @@ -1880,7 +655,7 @@ __global__ void __launch_bounds__(preprocess_block_size) // in the trim pass, for anything with lists, we only need to fully process bounding pages (those // at the beginning or the end of the row bounds) - if (!is_base_pass && !is_bounds_page(s, min_row, num_rows)) { + if (!is_base_pass && !is_bounds_page(s, min_row, num_rows, has_repetition)) { int depth = 0; while (depth < s->page.num_output_nesting_levels) { auto const thread_depth = depth + t; @@ -1961,28 +736,8 @@ __global__ void __launch_bounds__(preprocess_block_size) } } -// Copies null counts back to `nesting_decode` at the end of scope -struct null_count_back_copier { - page_state_s* s; - int t; - __device__ ~null_count_back_copier() - { - if (s->nesting_info != nullptr and s->nesting_info == s->nesting_decode_cache) { - int depth = 0; - while (depth < s->page.num_output_nesting_levels) { - int const thread_depth = depth + t; - if (thread_depth < s->page.num_output_nesting_levels) { - s->page.nesting_decode[thread_depth].null_count = - s->nesting_decode_cache[thread_depth].null_count; - } - depth += blockDim.x; - } - } - } -}; - /** - * @brief Kernel for co the column data stored in the pages + * @brief Kernel for computing the column data stored in the pages * * This function will write the page data and the page data's validity to the * output specified in the page's column chunk. If necessary, additional @@ -2008,25 +763,13 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } - - bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; - - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - // - // corner case: in the case of lists, we can have pages that contain "0" rows if the current row - // starts before this page and ends after this page: - // P0 P1 P2 - // |---------|---------|----------| - // ^------------------^ - // row start row end - // P1 will contain 0 rows - // - if (s->num_rows == 0 && !(has_repetition && (is_bounds_page(s, min_row, num_rows) || - is_page_contained(s, min_row, num_rows)))) { + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, non_string_filter{chunks}, true)) { return; } + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 64 : 32; } else { @@ -2074,7 +817,7 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values - int dtype = s->col.data_type & 7; + int const dtype = s->col.data_type & 7; src_pos += t - out_thread0; // the position in the output column/buffer diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh new file mode 100644 index 00000000000..4469ec59b7a --- /dev/null +++ b/cpp/src/io/parquet/page_decode.cuh @@ -0,0 +1,1327 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "parquet_gpu.hpp" +#include "rle_stream.cuh" + +#include + +#include + +namespace cudf::io::parquet::gpu { + +constexpr int preprocess_block_size = num_rle_stream_decode_threads; // 512 +constexpr int decode_block_size = 128; +constexpr int non_zero_buffer_size = decode_block_size * 2; + +constexpr int rolling_index(int index) { return index & (non_zero_buffer_size - 1); } +template +constexpr int rolling_lvl_index(int index) +{ + return index % lvl_buf_size; +} + +struct page_state_s { + uint8_t const* data_start; + uint8_t const* data_end; + uint8_t const* lvl_end; + uint8_t const* dict_base; // ptr to dictionary page data + int32_t dict_size; // size of dictionary data + int32_t first_row; // First row in page to output + int32_t num_rows; // Rows in page to decode (including rows to be skipped) + int32_t first_output_value; // First value in page to output + int32_t num_input_values; // total # of input/level values in the page + int32_t dtype_len; // Output data type length + int32_t dtype_len_in; // Can be larger than dtype_len if truncating 32-bit into 8-bit + int32_t dict_bits; // # of bits to store dictionary indices + uint32_t dict_run; + int32_t dict_val; + uint32_t initial_rle_run[NUM_LEVEL_TYPES]; // [def,rep] + int32_t initial_rle_value[NUM_LEVEL_TYPES]; // [def,rep] + int32_t error; + PageInfo page; + ColumnChunkDesc col; + + // (leaf) value decoding + int32_t nz_count; // number of valid entries in nz_idx (write position in circular buffer) + int32_t dict_pos; // write position of dictionary indices + int32_t src_pos; // input read position of final output value + int32_t ts_scale; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale + + // repetition/definition level decoding + int32_t input_value_count; // how many values of the input we've processed + int32_t input_row_count; // how many rows of the input we've processed + int32_t input_leaf_count; // how many leaf values of the input we've processed + uint8_t const* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] + uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] + uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] + int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded + int32_t row_index_lower_bound; // lower bound of row indices we should process + + // a shared-memory cache of frequently used data when decoding. The source of this data is + // normally stored in global memory which can yield poor performance. So, when possible + // we copy that info here prior to decoding + PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]; + // points to either nesting_decode_cache above when possible, or to the global source otherwise + PageNestingDecodeInfo* nesting_info; +}; + +// buffers only used in the decode kernel. separated from page_state_s to keep +// shared memory usage in other kernels (eg, gpuComputePageSizes) down. +struct page_state_buffers_s { + uint32_t nz_idx[non_zero_buffer_size]; // circular buffer of non-null value positions + uint32_t dict_idx[non_zero_buffer_size]; // Dictionary index, boolean, or string offset values + uint32_t str_len[non_zero_buffer_size]; // String length for plain encoding of strings +}; + +// Copies null counts back to `nesting_decode` at the end of scope +struct null_count_back_copier { + page_state_s* s; + int t; + __device__ ~null_count_back_copier() + { + if (s->nesting_info != nullptr and s->nesting_info == s->nesting_decode_cache) { + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + int const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + s->page.nesting_decode[thread_depth].null_count = + s->nesting_decode_cache[thread_depth].null_count; + } + depth += blockDim.x; + } + } + } +}; + +/** + * @brief Test if the given page is in a string column + */ +constexpr bool is_string_col(PageInfo const& page, device_span chunks) +{ + if (page.flags & PAGEINFO_FLAGS_DICTIONARY != 0) { return false; } + auto const& col = chunks[page.chunk_idx]; + return is_string_col(col); +} + +/** + * @brief Returns whether or not a page spans either the beginning or the end of the + * specified row bounds + * + * @param s The page to be checked + * @param start_row The starting row index + * @param num_rows The number of rows + * @param has_repetition True if the schema has nesting + * + * @return True if the page spans the beginning or the end of the row bounds + */ +inline __device__ bool is_bounds_page(page_state_s* const s, + size_t start_row, + size_t num_rows, + bool has_repetition) +{ + size_t const page_begin = s->col.start_row + s->page.chunk_row; + size_t const page_end = page_begin + s->page.num_rows; + size_t const begin = start_row; + size_t const end = start_row + num_rows; + + // for non-nested schemas, rows cannot span pages, so use a more restrictive test + return has_repetition + ? ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end)) + : ((page_begin < begin && page_end > begin) || (page_begin < end && page_end > end)); +} + +/** + * @brief Returns whether or not a page is completely contained within the specified + * row bounds + * + * @param s The page to be checked + * @param start_row The starting row index + * @param num_rows The number of rows + * + * @return True if the page is completely contained within the row bounds + */ +inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row, size_t num_rows) +{ + size_t const page_begin = s->col.start_row + s->page.chunk_row; + size_t const page_end = page_begin + s->page.num_rows; + size_t const begin = start_row; + size_t const end = start_row + num_rows; + + return page_begin >= begin && page_end <= end; +} + +/** + * @brief Retrieves string information for a string at the specified source position + * + * @param[in] s Page state input + * @param[out] sb Page state buffer output + * @param[in] src_pos Source position + * + * @return A pair containing a pointer to the string and its length + */ +inline __device__ cuda::std::pair gpuGetStringData( + page_state_s volatile* s, page_state_buffers_s volatile* sb, int src_pos) +{ + char const* ptr = nullptr; + size_t len = 0; + + if (s->dict_base) { + // String dictionary + uint32_t dict_pos = + (s->dict_bits > 0) ? sb->dict_idx[rolling_index(src_pos)] * sizeof(string_index_pair) : 0; + if (dict_pos < (uint32_t)s->dict_size) { + auto const* src = reinterpret_cast(s->dict_base + dict_pos); + ptr = src->first; + len = src->second; + } + } else { + // Plain encoding + uint32_t dict_pos = sb->dict_idx[rolling_index(src_pos)]; + if (dict_pos <= (uint32_t)s->dict_size) { + ptr = reinterpret_cast(s->data_start + dict_pos); + len = sb->str_len[rolling_index(src_pos)]; + } + } + + return {ptr, len}; +} + +/** + * @brief Performs RLE decoding of dictionary indexes + * + * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output + * @param[in] target_pos Target index position in dict_idx buffer (may exceed this value by up to + * 31) + * @param[in] t Warp1 thread ID (0..31) + * + * @return A pair containing the new output position, and the total length of strings decoded (this + * will only be valid on thread 0 and if sizes_only is true). In the event that this function + * decodes strings beyond target_pos, the total length of strings returned will include these + * additional values. + */ +template +__device__ cuda::std::pair gpuDecodeDictionaryIndices( + page_state_s volatile* s, + [[maybe_unused]] page_state_buffers_s volatile* sb, + int target_pos, + int t) +{ + uint8_t const* end = s->data_end; + int dict_bits = s->dict_bits; + int pos = s->dict_pos; + int str_len = 0; + + while (pos < target_pos) { + int is_literal, batch_len; + if (!t) { + uint32_t run = s->dict_run; + uint8_t const* cur = s->data_start; + if (run <= 1) { + run = (cur < end) ? get_vlq32(cur, end) : 0; + if (!(run & 1)) { + // Repeated value + int bytecnt = (dict_bits + 7) >> 3; + if (cur + bytecnt <= end) { + int32_t run_val = cur[0]; + if (bytecnt > 1) { + run_val |= cur[1] << 8; + if (bytecnt > 2) { + run_val |= cur[2] << 16; + if (bytecnt > 3) { run_val |= cur[3] << 24; } + } + } + s->dict_val = run_val & ((1 << dict_bits) - 1); + } + cur += bytecnt; + } + } + if (run & 1) { + // Literal batch: must output a multiple of 8, except for the last batch + int batch_len_div8; + batch_len = max(min(32, (int)(run >> 1) * 8), 1); + batch_len_div8 = (batch_len + 7) >> 3; + run -= batch_len_div8 * 2; + cur += batch_len_div8 * dict_bits; + } else { + batch_len = max(min(32, (int)(run >> 1)), 1); + run -= batch_len * 2; + } + s->dict_run = run; + s->data_start = cur; + is_literal = run & 1; + __threadfence_block(); + } + __syncwarp(); + is_literal = shuffle(is_literal); + batch_len = shuffle(batch_len); + + // compute dictionary index. + int dict_idx = 0; + if (t < batch_len) { + dict_idx = s->dict_val; + if (is_literal) { + int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; + uint8_t const* p = s->data_start + (ofs >> 3); + ofs &= 7; + if (p < end) { + uint32_t c = 8 - ofs; + dict_idx = (*p++) >> ofs; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } + } + } + dict_idx &= (1 << dict_bits) - 1; + } + } + + // if we're not computing sizes, store off the dictionary index + if constexpr (!sizes_only) { sb->dict_idx[rolling_index(pos + t)] = dict_idx; } + } + + // if we're computing sizes, add the length(s) + if constexpr (sizes_only) { + int const len = [&]() { + if (t >= batch_len || (pos + t >= target_pos)) { return 0; } + uint32_t const dict_pos = (s->dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; + if (dict_pos < (uint32_t)s->dict_size) { + const auto* src = reinterpret_cast(s->dict_base + dict_pos); + return src->second; + } + return 0; + }(); + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + // note: str_len will only be valid on thread 0. + str_len += WarpReduce(temp_storage).Sum(len); + } + + pos += batch_len; + } + return {pos, str_len}; +} + +/** + * @brief Performs RLE decoding of dictionary indexes, for when dict_size=1 + * + * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output + * @param[in] target_pos Target write position + * @param[in] t Thread ID + * + * @return The new output position + */ +inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, + page_state_buffers_s volatile* sb, + int target_pos, + int t) +{ + uint8_t const* end = s->data_end; + int pos = s->dict_pos; + + while (pos < target_pos) { + int is_literal, batch_len; + if (!t) { + uint32_t run = s->dict_run; + uint8_t const* cur = s->data_start; + if (run <= 1) { + run = (cur < end) ? get_vlq32(cur, end) : 0; + if (!(run & 1)) { + // Repeated value + s->dict_val = (cur < end) ? cur[0] & 1 : 0; + cur++; + } + } + if (run & 1) { + // Literal batch: must output a multiple of 8, except for the last batch + int batch_len_div8; + batch_len = max(min(32, (int)(run >> 1) * 8), 1); + if (batch_len >= 8) { batch_len &= ~7; } + batch_len_div8 = (batch_len + 7) >> 3; + run -= batch_len_div8 * 2; + cur += batch_len_div8; + } else { + batch_len = max(min(32, (int)(run >> 1)), 1); + run -= batch_len * 2; + } + s->dict_run = run; + s->data_start = cur; + is_literal = run & 1; + __threadfence_block(); + } + __syncwarp(); + is_literal = shuffle(is_literal); + batch_len = shuffle(batch_len); + if (t < batch_len) { + int dict_idx; + if (is_literal) { + int32_t ofs = t - ((batch_len + 7) & ~7); + uint8_t const* p = s->data_start + (ofs >> 3); + dict_idx = (p < end) ? (p[0] >> (ofs & 7u)) & 1 : 0; + } else { + dict_idx = s->dict_val; + } + sb->dict_idx[rolling_index(pos + t)] = dict_idx; + } + pos += batch_len; + } + return pos; +} + +/** + * @brief Parses the length and position of strings and returns total length of all strings + * processed + * + * @param[in,out] s Page state input/output + * @param[out] sb Page state buffer output + * @param[in] target_pos Target output position + * @param[in] t Thread ID + * + * @return Total length of strings processed + */ +template +__device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, + [[maybe_unused]] page_state_buffers_s volatile* sb, + int target_pos, + int t) +{ + int pos = s->dict_pos; + int total_len = 0; + + // This step is purely serial + if (!t) { + uint8_t const* cur = s->data_start; + int dict_size = s->dict_size; + int k = s->dict_val; + + while (pos < target_pos) { + int len; + if (k + 4 <= dict_size) { + len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); + k += 4; + if (k + len > dict_size) { len = 0; } + } else { + len = 0; + } + if constexpr (!sizes_only) { + sb->dict_idx[rolling_index(pos)] = k; + sb->str_len[rolling_index(pos)] = len; + } + k += len; + total_len += len; + pos++; + } + s->dict_val = k; + __threadfence_block(); + } + + return total_len; +} + +/** + * @brief Decode values out of a definition or repetition stream + * + * @param[in,out] s Page state input/output + * @param[in] t target_count Target count of stream values on output + * @param[in] t Warp0 thread ID (0..31) + * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION + */ +template +__device__ void gpuDecodeStream( + level_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) +{ + uint8_t const* cur_def = s->lvl_start[lvl]; + uint8_t const* end = s->lvl_end; + uint32_t level_run = s->initial_rle_run[lvl]; + int32_t level_val = s->initial_rle_value[lvl]; + int level_bits = s->col.level_bits[lvl]; + int32_t num_input_values = s->num_input_values; + int32_t value_count = s->lvl_count[lvl]; + int32_t batch_coded_count = 0; + + while (value_count < target_count && value_count < num_input_values) { + int batch_len; + if (level_run <= 1) { + // Get a new run symbol from the byte stream + int sym_len = 0; + if (!t) { + uint8_t const* cur = cur_def; + if (cur < end) { level_run = get_vlq32(cur, end); } + if (!(level_run & 1)) { + if (cur < end) level_val = cur[0]; + cur++; + if (level_bits > 8) { + if (cur < end) level_val |= cur[0] << 8; + cur++; + } + } + if (cur > end || level_run <= 1) { s->error = 0x10; } + sym_len = (int32_t)(cur - cur_def); + __threadfence_block(); + } + sym_len = shuffle(sym_len); + level_val = shuffle(level_val); + level_run = shuffle(level_run); + cur_def += sym_len; + } + if (s->error) { break; } + + batch_len = min(num_input_values - value_count, 32); + if (level_run & 1) { + // Literal run + int batch_len8; + batch_len = min(batch_len, (level_run >> 1) * 8); + batch_len8 = (batch_len + 7) >> 3; + if (t < batch_len) { + int bitpos = t * level_bits; + uint8_t const* cur = cur_def + (bitpos >> 3); + bitpos &= 7; + if (cur < end) level_val = cur[0]; + cur++; + if (level_bits > 8 - bitpos && cur < end) { + level_val |= cur[0] << 8; + cur++; + if (level_bits > 16 - bitpos && cur < end) level_val |= cur[0] << 16; + } + level_val = (level_val >> bitpos) & ((1 << level_bits) - 1); + } + level_run -= batch_len8 * 2; + cur_def += batch_len8 * level_bits; + } else { + // Repeated value + batch_len = min(batch_len, level_run >> 1); + level_run -= batch_len * 2; + } + if (t < batch_len) { + int idx = value_count + t; + output[rolling_index(idx)] = level_val; + } + batch_coded_count += batch_len; + value_count += batch_len; + } + + // update the stream info + if (!t) { + s->lvl_start[lvl] = cur_def; + s->initial_rle_run[lvl] = level_run; + s->initial_rle_value[lvl] = level_val; + s->lvl_count[lvl] = value_count; + } +} + +/** + * @brief Store a validity mask containing value_count bits into the output validity buffer of the + * page. + * + * @param[in,out] nesting_info The page/nesting information to store the mask in. The validity map + * offset is also updated + * @param[in] valid_mask The validity mask to be stored + * @param[in] value_count # of bits in the validity mask + */ +inline __device__ void store_validity(PageNestingDecodeInfo* nesting_info, + uint32_t valid_mask, + int32_t value_count) +{ + int word_offset = nesting_info->valid_map_offset / 32; + int bit_offset = nesting_info->valid_map_offset % 32; + // if we fit entirely in the output word + if (bit_offset + value_count <= 32) { + auto relevant_mask = static_cast((static_cast(1) << value_count) - 1); + + if (relevant_mask == ~0) { + nesting_info->valid_map[word_offset] = valid_mask; + } else { + atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); + atomicOr(nesting_info->valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); + } + } + // we're going to spill over into the next word. + // note : writing both values here is the lazy/slow way. we could be writing just + // the first word and rolling the remaining bits over into the next call. + // however, some basic performance tests shows almost no difference between these two + // methods. More detailed performance testing might be worthwhile here. + else { + uint32_t bits_left = 32 - bit_offset; + + // first word. strip bits_left bits off the beginning and store that + uint32_t relevant_mask = ((1 << bits_left) - 1); + uint32_t mask_word0 = valid_mask & relevant_mask; + atomicAnd(nesting_info->valid_map + word_offset, ~(relevant_mask << bit_offset)); + atomicOr(nesting_info->valid_map + word_offset, mask_word0 << bit_offset); + + // second word. strip the remainder of the bits off the end and store that + relevant_mask = ((1 << (value_count - bits_left)) - 1); + uint32_t mask_word1 = valid_mask & (relevant_mask << bits_left); + atomicAnd(nesting_info->valid_map + word_offset + 1, ~(relevant_mask)); + atomicOr(nesting_info->valid_map + word_offset + 1, mask_word1 >> bits_left); + } + + nesting_info->valid_map_offset += value_count; +} + +/** + * @brief Compute the nesting bounds within the hierarchy to add values to, and the definition level + * D to which we should considered them null or not. + * + * @param[out] start_depth The start nesting depth + * @param[out] end_depth The end nesting depth (inclusive) + * @param[out] d The definition level up to which added values are not-null. if t is out of bounds, + * d will be -1 + * @param[in] s Local page information + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer + * @param[in] input_value_count The current count of input level values we have processed + * @param[in] target_input_value_count The desired # of input level values we want to process + * @param[in] t Thread index + */ +template +inline __device__ void get_nesting_bounds(int& start_depth, + int& end_depth, + int& d, + page_state_s* s, + level_t const* const rep, + level_t const* const def, + int input_value_count, + int32_t target_input_value_count, + int t) +{ + start_depth = -1; + end_depth = -1; + d = -1; + if (input_value_count + t < target_input_value_count) { + int const index = rolling_lvl_index(input_value_count + t); + d = static_cast(def[index]); + // if we have repetition (there are list columns involved) we have to + // bound what nesting levels we apply values to + if (s->col.max_level[level_type::REPETITION] > 0) { + int r = rep[index]; + start_depth = s->nesting_info[r].start_depth; + end_depth = s->nesting_info[d].end_depth; + } + // for columns without repetition (even ones involving structs) we always + // traverse the entire hierarchy. + else { + start_depth = 0; + end_depth = s->col.max_nesting_depth - 1; + } + } +} + +/** + * @brief Process a batch of incoming repetition/definition level values and generate + * validity, nested column offsets (where appropriate) and decoding indices. + * + * @param[in] target_input_value_count The # of repetition/definition levels to process up to + * @param[in] s Local page information + * @param[out] sb Page state buffer output + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer + * @param[in] t Thread index + */ +template +__device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, + page_state_s* s, + page_state_buffers_s* sb, + level_t const* const rep, + level_t const* const def, + int t) +{ + // max nesting depth of the column + int const max_depth = s->col.max_nesting_depth; + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // how many (input) values we've processed in the page so far + int input_value_count = s->input_value_count; + // how many rows we've processed in the page so far + int input_row_count = s->input_row_count; + + PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + + // process until we've reached the target + while (input_value_count < target_input_value_count) { + // determine the nesting bounds for this thread (the range of nesting depths we + // will generate new value indices and validity bits for) + int start_depth, end_depth, d; + get_nesting_bounds( + start_depth, end_depth, d, s, rep, def, input_value_count, target_input_value_count, t); + + // 4 interesting things to track: + // thread_value_count : # of output values from the view of this thread + // warp_value_count : # of output values for the whole warp + // + // thread_valid_count : # of valid values from the view of this thread + // warp_valid_count : # of valid values for the whole warp + uint32_t thread_value_count, warp_value_count; + uint32_t thread_valid_count, warp_valid_count; + + // track (page-relative) row index for the thread so we can compare against input bounds + // keep track of overall # of rows we've read. + int const is_new_row = start_depth == 0 ? 1 : 0; + uint32_t const warp_row_count_mask = ballot(is_new_row); + int32_t const thread_row_index = + input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); + input_row_count += __popc(warp_row_count_mask); + // is this thread within read row bounds? + int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && + thread_row_index < (s->first_row + s->num_rows) + ? 1 + : 0; + + // compute warp and thread value counts + uint32_t const warp_count_mask = + ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); + + warp_value_count = __popc(warp_count_mask); + // Note : ((1 << t) - 1) implies "for all threads before me" + thread_value_count = __popc(warp_count_mask & ((1 << t) - 1)); + + // walk from 0 to max_depth + uint32_t next_thread_value_count, next_warp_value_count; + for (int s_idx = 0; s_idx < max_depth; s_idx++) { + PageNestingDecodeInfo* nesting_info = &nesting_info_base[s_idx]; + + // if we are within the range of nesting levels we should be adding value indices for + int const in_nesting_bounds = + ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; + + // everything up to the max_def_level is a non-null value + uint32_t const is_valid = d >= nesting_info->max_def_level && in_nesting_bounds ? 1 : 0; + + // compute warp and thread valid counts + uint32_t const warp_valid_mask = + // for flat schemas, a simple ballot_sync gives us the correct count and bit positions + // because every value in the input matches to a value in the output + !has_repetition + ? ballot(is_valid) + : + // for nested schemas, it's more complicated. This warp will visit 32 incoming values, + // however not all of them will necessarily represent a value at this nesting level. so + // the validity bit for thread t might actually represent output value t-6. the correct + // position for thread t's bit is cur_value_count. for cuda 11 we could use + // __reduce_or_sync(), but until then we have to do a warp reduce. + WarpReduceOr32(is_valid << thread_value_count); + + thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1)); + warp_valid_count = __popc(warp_valid_mask); + + // if this is the value column emit an index for value decoding + if (is_valid && s_idx == max_depth - 1) { + int const src_pos = nesting_info->valid_count + thread_valid_count; + int const dst_pos = nesting_info->value_count + thread_value_count; + // nz_idx is a mapping of src buffer indices to destination buffer indices + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + + // compute warp and thread value counts for the -next- nesting level. we need to + // do this for nested schemas so that we can emit an offset for the -current- nesting + // level. more concretely : the offset for the current nesting level == current length of the + // next nesting level + if (s_idx < max_depth - 1) { + uint32_t const next_warp_count_mask = + ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0); + next_warp_value_count = __popc(next_warp_count_mask); + next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1)); + + // if we're -not- at a leaf column and we're within nesting/row bounds + // and we have a valid data_out pointer, it implies this is a list column, so + // emit an offset. + if (in_nesting_bounds && nesting_info->data_out != nullptr) { + int const idx = nesting_info->value_count + thread_value_count; + cudf::size_type const ofs = nesting_info_base[s_idx + 1].value_count + + next_thread_value_count + + nesting_info_base[s_idx + 1].page_start_value; + (reinterpret_cast(nesting_info->data_out))[idx] = ofs; + } + } + + // nested schemas always read and write to the same bounds (that is, read and write positions + // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the + // first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector here + // we need to adjust our computed mask to take into account the write row bounds. + int const in_write_row_bounds = + !has_repetition + ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) + : in_row_bounds; + int const first_thread_in_write_range = + !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; + + // # of bits to of the validity mask to write out + int const warp_valid_mask_bit_count = + first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; + + // increment count of valid values, count of total values, and update validity mask + if (!t) { + if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { + uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; + store_validity(nesting_info, warp_output_valid_mask, warp_valid_mask_bit_count); + + nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); + } + nesting_info->valid_count += warp_valid_count; + nesting_info->value_count += warp_value_count; + } + + // propagate value counts for the next level + warp_value_count = next_warp_value_count; + thread_value_count = next_thread_value_count; + } + + input_value_count += min(32, (target_input_value_count - input_value_count)); + __syncwarp(); + } + + // update + if (!t) { + // update valid value count for decoding and total # of values we've processed + s->nz_count = nesting_info_base[max_depth - 1].valid_count; + s->input_value_count = input_value_count; + s->input_row_count = input_row_count; + } +} + +/** + * @brief Process repetition and definition levels up to the target count of leaf values. + * + * In order to decode actual leaf values from the input stream, we need to generate the + * list of non-null value positions (page_state_s::nz_idx). We do this by processing + * the repetition and definition level streams. This process also generates validity information, + * and offset column values in the case of nested schemas. Because of the way the streams + * are encoded, this function may generate slightly more than target_leaf_count. + * + * Only runs on 1 warp. + * + * @param[in] s The local page state + * @param[out] sb Page state buffer output + * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for + * @param[in] rep Repetition level buffer + * @param[in] def Definition level buffer + * @param[in] t Thread index + */ +template +__device__ void gpuDecodeLevels(page_state_s* s, + page_state_buffers_s* sb, + int32_t target_leaf_count, + level_t* const rep, + level_t* const def, + int t) +{ + bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + constexpr int batch_size = 32; + int cur_leaf_count = target_leaf_count; + while (!s->error && s->nz_count < target_leaf_count && + s->input_value_count < s->num_input_values) { + if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); } + gpuDecodeStream(def, s, cur_leaf_count, t, level_type::DEFINITION); + __syncwarp(); + + // because the rep and def streams are encoded separately, we cannot request an exact + // # of values to be decoded at once. we can only process the lowest # of decoded rep/def + // levels we get. + int actual_leaf_count = has_repetition ? min(s->lvl_count[level_type::REPETITION], + s->lvl_count[level_type::DEFINITION]) + : s->lvl_count[level_type::DEFINITION]; + + // process what we got back + gpuUpdateValidityOffsetsAndRowIndices( + actual_leaf_count, s, sb, rep, def, t); + cur_leaf_count = actual_leaf_count + batch_size; + __syncwarp(); + } +} + +/** + * @brief Parse the beginning of the level section (definition or repetition), + * initializes the initial RLE run & value, and returns the section length + * + * @param[in,out] s The page state + * @param[in] cur The current data position + * @param[in] end The end of the data + * @param[in] level_bits The bits required + * @param[in] is_decode_step True if we are performing the decode step. + * @param[in,out] decoders The repetition and definition level stream decoders + * + * @return The length of the section + */ +inline __device__ uint32_t InitLevelSection(page_state_s* s, + uint8_t const* cur, + uint8_t const* end, + level_type lvl) +{ + int32_t len; + int level_bits = s->col.level_bits[lvl]; + Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding + : s->page.repetition_level_encoding; + + auto start = cur; + if (level_bits == 0) { + len = 0; + s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value + s->initial_rle_value[lvl] = 0; + s->lvl_start[lvl] = cur; + s->abs_lvl_start[lvl] = cur; + } else if (encoding == Encoding::RLE) { + // V2 only uses RLE encoding, so only perform check here + if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) { + len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes; + } else if (cur + 4 < end) { + len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); + cur += 4; + } else { + len = 0; + s->error = 2; + } + s->abs_lvl_start[lvl] = cur; + if (!s->error) { + uint32_t run = get_vlq32(cur, end); + s->initial_rle_run[lvl] = run; + if (!(run & 1)) { + int v = (cur < end) ? cur[0] : 0; + cur++; + if (level_bits > 8) { + v |= ((cur < end) ? cur[0] : 0) << 8; + cur++; + } + s->initial_rle_value[lvl] = v; + } + s->lvl_start[lvl] = cur; + } + + if (cur > end) { s->error = 2; } + } else if (encoding == Encoding::BIT_PACKED) { + len = (s->page.num_input_values * level_bits + 7) >> 3; + s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1; // literal run + s->initial_rle_value[lvl] = 0; + s->lvl_start[lvl] = cur; + s->abs_lvl_start[lvl] = cur; + } else { + s->error = 3; + len = 0; + } + + s->abs_lvl_end[lvl] = start + len; + + return static_cast(len); +} + +/** + * @brief Functor for setupLocalPageInfo that always returns true. + */ +struct all_types_filter { + __device__ inline bool operator()(PageInfo const& page) { return true; } +}; + +/** + * @brief Functor for setupLocalPageInfo that returns true if this is not a string column. + */ +struct non_string_filter { + device_span chunks; + + __device__ inline bool operator()(PageInfo const& page) { return !is_string_col(page, chunks); } +}; + +/** + * @brief Functor for setupLocalPageInfo that returns true if this is a string column. + */ +struct string_filter { + device_span chunks; + + __device__ inline bool operator()(PageInfo const& page) { return is_string_col(page, chunks); } +}; + +/** + * @brief Sets up block-local page state information from the global pages. + * + * @param[in, out] s The local page state to be filled in + * @param[in] p The global page to be copied from + * @param[in] chunks The global list of chunks + * @param[in] min_row Crop all rows below min_row + * @param[in] num_rows Maximum number of rows to read + * @param[in] filter Filtering function used to decide which pages to operate on + * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess) + * @param[in] decoders rle_stream decoders which will be used for decoding levels. Optional. + * @tparam Filter Function that takes a PageInfo reference and returns true if the given page should + * be operated on Currently only used by gpuComputePageSizes step) + */ +template +inline __device__ bool setupLocalPageInfo(page_state_s* const s, + PageInfo const* p, + device_span chunks, + size_t min_row, + size_t num_rows, + Filter filter, + bool is_decode_step) +{ + int t = threadIdx.x; + + // Fetch page info + if (!t) { + s->page = *p; + s->nesting_info = nullptr; + s->col = chunks[s->page.chunk_idx]; + } + __syncthreads(); + + // return false if this is a dictionary page or it does not pass the filter condition + if ((s->page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0 || !filter(s->page)) { return false; } + + // our starting row (absolute index) is + // col.start_row == absolute row index + // page.chunk-row == relative row index within the chunk + size_t const page_start_row = s->col.start_row + s->page.chunk_row; + + // if we can use the nesting decode cache, set it up now + auto const can_use_decode_cache = s->page.nesting_info_size <= max_cacheable_nesting_decode_info; + if (can_use_decode_cache) { + int depth = 0; + while (depth < s->page.nesting_info_size) { + int const thread_depth = depth + t; + if (thread_depth < s->page.nesting_info_size) { + // these values need to be copied over from global + s->nesting_decode_cache[thread_depth].max_def_level = + s->page.nesting_decode[thread_depth].max_def_level; + s->nesting_decode_cache[thread_depth].page_start_value = + s->page.nesting_decode[thread_depth].page_start_value; + s->nesting_decode_cache[thread_depth].start_depth = + s->page.nesting_decode[thread_depth].start_depth; + s->nesting_decode_cache[thread_depth].end_depth = + s->page.nesting_decode[thread_depth].end_depth; + } + depth += blockDim.x; + } + } + + if (!t) { + s->nesting_info = can_use_decode_cache ? s->nesting_decode_cache : s->page.nesting_decode; + + // NOTE: s->page.num_rows, s->col.chunk_row, s->first_row and s->num_rows will be + // invalid/bogus during first pass of the preprocess step for nested types. this is ok + // because we ignore these values in that stage. + auto const max_row = min_row + num_rows; + + // if we are totally outside the range of the input, do nothing + if ((page_start_row > max_row) || (page_start_row + s->page.num_rows < min_row)) { + s->first_row = 0; + s->num_rows = 0; + } + // otherwise + else { + s->first_row = page_start_row >= min_row ? 0 : min_row - page_start_row; + auto const max_page_rows = s->page.num_rows - s->first_row; + s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row + ? max_page_rows + : max_row - (page_start_row + s->first_row); + } + } + + __syncthreads(); + + // zero counts + int depth = 0; + while (depth < s->page.num_output_nesting_levels) { + int const thread_depth = depth + t; + if (thread_depth < s->page.num_output_nesting_levels) { + s->nesting_info[thread_depth].valid_count = 0; + s->nesting_info[thread_depth].value_count = 0; + s->nesting_info[thread_depth].null_count = 0; + } + depth += blockDim.x; + } + __syncthreads(); + + // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. + // + // corner case: in the case of lists, we can have pages that contain "0" rows if the current row + // starts before this page and ends after this page: + // P0 P1 P2 + // |---------|---------|----------| + // ^------------------^ + // row start row end + // P1 will contain 0 rows + // + // NOTE: this check needs to be done after the null counts have been zeroed out + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + if (is_decode_step && s->num_rows == 0 && + !(has_repetition && (is_bounds_page(s, min_row, num_rows, has_repetition) || + is_page_contained(s, min_row, num_rows)))) { + return false; + } + + if (!t) { + s->error = 0; + + // IMPORTANT : nested schemas can have 0 rows in a page but still have + // values. The case is: + // - On page N-1, the last row starts, with 2/6 values encoded + // - On page N, the remaining 4/6 values are encoded, but there are no new rows. + // if (s->page.num_input_values > 0 && s->page.num_rows > 0) { + if (s->page.num_input_values > 0) { + uint8_t* cur = s->page.page_data; + uint8_t* end = cur + s->page.uncompressed_page_size; + + uint32_t dtype_len_out = s->col.data_type >> 3; + s->ts_scale = 0; + // Validate data type + auto const data_type = s->col.data_type & 7; + switch (data_type) { + case BOOLEAN: + s->dtype_len = 1; // Boolean are stored as 1 byte on the output + break; + case INT32: [[fallthrough]]; + case FLOAT: s->dtype_len = 4; break; + case INT64: + if (s->col.ts_clock_rate) { + int32_t units = 0; + // Duration types are not included because no scaling is done when reading + if (s->col.converted_type == TIMESTAMP_MILLIS) { + units = cudf::timestamp_ms::period::den; + } else if (s->col.converted_type == TIMESTAMP_MICROS) { + units = cudf::timestamp_us::period::den; + } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { + units = cudf::timestamp_ns::period::den; + } + if (units and units != s->col.ts_clock_rate) { + s->ts_scale = (s->col.ts_clock_rate < units) ? -(units / s->col.ts_clock_rate) + : (s->col.ts_clock_rate / units); + } + } + [[fallthrough]]; + case DOUBLE: s->dtype_len = 8; break; + case INT96: s->dtype_len = 12; break; + case BYTE_ARRAY: + if (s->col.converted_type == DECIMAL) { + auto const decimal_precision = s->col.decimal_precision; + s->dtype_len = [decimal_precision]() { + if (decimal_precision <= MAX_DECIMAL32_PRECISION) { + return sizeof(int32_t); + } else if (decimal_precision <= MAX_DECIMAL64_PRECISION) { + return sizeof(int64_t); + } else { + return sizeof(__int128_t); + } + }(); + } else { + s->dtype_len = sizeof(string_index_pair); + } + break; + default: // FIXED_LEN_BYTE_ARRAY: + s->dtype_len = dtype_len_out; + s->error |= (s->dtype_len <= 0); + break; + } + // Special check for downconversions + s->dtype_len_in = s->dtype_len; + if (s->col.converted_type == DECIMAL && data_type == FIXED_LEN_BYTE_ARRAY) { + s->dtype_len = [dtype_len = s->dtype_len]() { + if (dtype_len <= sizeof(int32_t)) { + return sizeof(int32_t); + } else if (dtype_len <= sizeof(int64_t)) { + return sizeof(int64_t); + } else { + return sizeof(__int128_t); + } + }(); + } else if (data_type == INT32) { + if (dtype_len_out == 1) { + // INT8 output + s->dtype_len = 1; + } else if (dtype_len_out == 2) { + // INT16 output + s->dtype_len = 2; + } else if (s->col.converted_type == TIME_MILLIS) { + // INT64 output + s->dtype_len = 8; + } + } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { + s->dtype_len = 4; // HASH32 output + } else if (data_type == INT96) { + s->dtype_len = 8; // Convert to 64-bit timestamp + } + + // during the decoding step we need to offset the global output buffers + // for each level of nesting so that we write to the section this page + // is responsible for. + // - for flat schemas, we can do this directly by using row counts + // - for nested schemas, these offsets are computed during the preprocess step + // + // NOTE: in a chunked read situation, s->col.column_data_base and s->col.valid_map_base + // will be aliased to memory that has been freed when we get here in the non-decode step, so + // we cannot check against nullptr. we'll just check a flag directly. + if (is_decode_step) { + int max_depth = s->col.max_nesting_depth; + for (int idx = 0; idx < max_depth; idx++) { + PageNestingDecodeInfo* nesting_info = &s->nesting_info[idx]; + + size_t output_offset; + // schemas without lists + if (s->col.max_level[level_type::REPETITION] == 0) { + output_offset = page_start_row >= min_row ? page_start_row - min_row : 0; + } + // for schemas with lists, we've already got the exact value precomputed + else { + output_offset = nesting_info->page_start_value; + } + + if (s->col.column_data_base != nullptr) { + nesting_info->data_out = static_cast(s->col.column_data_base[idx]); + if (s->col.column_string_base != nullptr) { + nesting_info->string_out = static_cast(s->col.column_string_base[idx]); + } + + nesting_info->data_out = static_cast(s->col.column_data_base[idx]); + + if (nesting_info->data_out != nullptr) { + // anything below max depth with a valid data pointer must be a list, so the + // element size is the size of the offset type. + uint32_t len = idx < max_depth - 1 ? sizeof(cudf::size_type) : s->dtype_len; + // if this is a string column, then dtype_len is a lie. data will be offsets rather + // than (ptr,len) tuples. + if (data_type == BYTE_ARRAY && s->dtype_len != 4) { len = sizeof(cudf::size_type); } + nesting_info->data_out += (output_offset * len); + } + if (nesting_info->string_out != nullptr) { + nesting_info->string_out += s->page.str_offset; + } + nesting_info->valid_map = s->col.valid_map_base[idx]; + if (nesting_info->valid_map != nullptr) { + nesting_info->valid_map += output_offset >> 5; + nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); + } + } + } + } + s->first_output_value = 0; + + // Find the compressed size of repetition levels + cur += InitLevelSection(s, cur, end, level_type::REPETITION); + // Find the compressed size of definition levels + cur += InitLevelSection(s, cur, end, level_type::DEFINITION); + + s->dict_bits = 0; + s->dict_base = nullptr; + s->dict_size = 0; + // NOTE: if additional encodings are supported in the future, modifications must + // be made to is_supported_encoding() in reader_impl_preprocess.cu + switch (s->page.encoding) { + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE_DICTIONARY: + // RLE-packed dictionary indices, first byte indicates index length in bits + if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { + // String dictionary: use index + s->dict_base = reinterpret_cast(s->col.str_dict_index); + s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); + } else { + s->dict_base = + s->col.page_info[0].page_data; // dictionary is always stored in the first page + s->dict_size = s->col.page_info[0].uncompressed_page_size; + } + s->dict_run = 0; + s->dict_val = 0; + s->dict_bits = (cur < end) ? *cur++ : 0; + if (s->dict_bits > 32 || !s->dict_base) { s->error = (10 << 8) | s->dict_bits; } + break; + case Encoding::PLAIN: + s->dict_size = static_cast(end - cur); + s->dict_val = 0; + if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; } + break; + case Encoding::RLE: s->dict_run = 0; break; + default: + s->error = 1; // Unsupported encoding + break; + } + if (cur > end) { s->error = 1; } + s->lvl_end = cur; + s->data_start = cur; + s->data_end = end; + } else { + s->error = 1; + } + + s->lvl_count[level_type::REPETITION] = 0; + s->lvl_count[level_type::DEFINITION] = 0; + s->nz_count = 0; + s->num_input_values = s->page.num_input_values; + s->dict_pos = 0; + s->src_pos = 0; + + // for flat hierarchies, we can't know how many leaf values to skip unless we do a full + // preprocess of the definition levels (since nulls will have no actual decodable value, there + // is no direct correlation between # of rows and # of decodable values). so we will start + // processing at the beginning of the value stream and disregard any indices that start + // before the first row. + if (s->col.max_level[level_type::REPETITION] == 0) { + s->page.skipped_values = 0; + s->page.skipped_leaf_values = 0; + s->input_value_count = 0; + s->input_row_count = 0; + s->input_leaf_count = 0; + + s->row_index_lower_bound = -1; + } + // for nested hierarchies, we have run a preprocess that lets us skip directly to the values + // we need to start decoding at + else { + // input_row_count translates to "how many rows we have processed so far", so since we are + // skipping directly to where we want to start decoding, set it to first_row + s->input_row_count = s->first_row; + + // return the lower bound to compare (page-relative) thread row index against. Explanation: + // In the case of nested schemas, rows can span page boundaries. That is to say, + // we can encounter the first value for row X on page M, but the last value for page M + // might not be the last value for row X. page M+1 (or further) may contain the last value. + // + // This means that the first values we encounter for a given page (M+1) may not belong to the + // row indicated by chunk_row, but to the row before it that spanned page boundaries. If that + // previous row is within the overall row bounds, include the values by allowing relative row + // index -1 + int const max_row = (min_row + num_rows) - 1; + if (min_row < page_start_row && max_row >= page_start_row - 1) { + s->row_index_lower_bound = -1; + } else { + s->row_index_lower_bound = s->first_row; + } + + // if we're in the decoding step, jump directly to the first + // value we care about + if (is_decode_step) { + s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0; + } else { + s->input_value_count = 0; + s->input_leaf_count = 0; + s->page.skipped_values = + -1; // magic number to indicate it hasn't been set for use inside UpdatePageSizes + s->page.skipped_leaf_values = 0; + } + } + + __threadfence_block(); + } + __syncthreads(); + + return true; +} + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu new file mode 100644 index 00000000000..9173d408192 --- /dev/null +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -0,0 +1,865 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "page_decode.cuh" + +#include +#include + +namespace cudf { +namespace io { +namespace parquet { +namespace gpu { + +namespace { + +// stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. +// copies from src to dst in 16B chunks per thread. +__device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + using cudf::strings::detail::load_uint4; + + constexpr size_t out_datatype_size = sizeof(uint4); + constexpr size_t in_datatype_size = sizeof(uint); + + auto const alignment_offset = reinterpret_cast(dst) % out_datatype_size; + uint4* out_chars_aligned = reinterpret_cast(dst - alignment_offset); + auto const in_start = src; + + // Both `out_start_aligned` and `out_end_aligned` are indices into `dst`. + // `out_start_aligned` is the first 16B aligned memory location after `dst + 4`. + // `out_end_aligned` is the last 16B aligned memory location before `len - 4`. Characters + // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. + // `dst + 4` and `len - 4` are used instead of `dst` and `len` to avoid + // `load_uint4` reading beyond string boundaries. + // use signed int since out_end_aligned can be negative. + int64_t out_start_aligned = (in_datatype_size + alignment_offset + out_datatype_size - 1) / + out_datatype_size * out_datatype_size - + alignment_offset; + int64_t out_end_aligned = + (len - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - + alignment_offset; + + for (int64_t ichar = out_start_aligned + lane_id * out_datatype_size; ichar < out_end_aligned; + ichar += warp_size * out_datatype_size) { + *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = + load_uint4((const char*)in_start + ichar); + } + + // Tail logic: copy characters of the current string outside + // `[out_start_aligned, out_end_aligned)`. + if (out_end_aligned <= out_start_aligned) { + // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the + // entire string. + for (int64_t ichar = lane_id; ichar < len; ichar += warp_size) { + dst[ichar] = in_start[ichar]; + } + } else { + // Copy characters in range `[0, out_start_aligned)`. + if (lane_id < out_start_aligned) { dst[lane_id] = in_start[lane_id]; } + // Copy characters in range `[out_end_aligned, len)`. + int64_t ichar = out_end_aligned + lane_id; + if (ichar < len) { dst[ichar] = in_start[ichar]; } + } +} + +/** + * @brief char-parallel string copy. + */ +__device__ void ll_strcpy(uint8_t* dst, uint8_t const* src, size_t len, uint32_t lane_id) +{ + using cudf::detail::warp_size; + if (len > 64) { + wideStrcpy(dst, src, len, lane_id); + } else { + for (int i = lane_id; i < len; i += warp_size) { + dst[i] = src[i]; + } + } +} + +/** + * @brief Perform exclusive scan on an array of any length using a single block of threads. + */ +template +__device__ void block_excl_sum(size_type* arr, size_type length, size_type initial_value) +{ + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int const t = threadIdx.x; + + // do a series of block sums, storing results in arr as we go + for (int pos = 0; pos < length; pos += block_size) { + int const tidx = pos + t; + size_type tval = tidx < length ? arr[tidx] : 0; + size_type block_sum; + block_scan(scan_storage).ExclusiveScan(tval, tval, initial_value, cub::Sum(), block_sum); + if (tidx < length) { arr[tidx] = tval; } + initial_value += block_sum; + } +} + +/** + * @brief Compute the start and end page value bounds for this page + * + * This uses definition and repetition level info to determine the number of valid and null + * values for the page, taking into account skip_rows/num_rows (if set). + * + * @param s The local page info + * @param min_row Row index to start reading at + * @param num_rows Maximum number of rows to read + * @param is_bounds_pg True if this page is clipped + * @param has_repetition True if the schema is nested + * @param decoders Definition and repetition level decoders + * @return pair containing start and end value indexes + * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels + * @tparam level_t Type used to store decoded repetition and definition levels + */ +template +__device__ thrust::pair page_bounds(page_state_s* const s, + size_t min_row, + size_t num_rows, + bool is_bounds_pg, + bool has_repetition, + rle_stream* decoders) +{ + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + int const t = threadIdx.x; + + // decode batches of level stream data using rle_stream objects and use the results to + // calculate start and end value positions in the encoded string data. + int const max_depth = s->col.max_nesting_depth; + int const max_def = s->nesting_info[max_depth - 1].max_def_level; + + // can skip all this if we know there are no nulls + if (max_def == 0 && !is_bounds_pg) { + s->page.num_valids = s->num_input_values; + s->page.num_nulls = 0; + return {0, s->num_input_values}; + } + + int start_value = 0; + int end_value = s->page.num_input_values; + auto const pp = &s->page; + auto const col = &s->col; + + // initialize the stream decoders (requires values computed in setupLocalPageInfo) + int const max_batch_size = lvl_buf_size; + auto const def_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); + auto const rep_decode = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); + decoders[level_type::DEFINITION].init(s->col.level_bits[level_type::DEFINITION], + s->abs_lvl_start[level_type::DEFINITION], + s->abs_lvl_end[level_type::DEFINITION], + max_batch_size, + def_decode, + s->page.num_input_values); + // only need repetition if this is a bounds page. otherwise all we need is def level info + // to count the nulls. + if (has_repetition && is_bounds_pg) { + decoders[level_type::REPETITION].init(s->col.level_bits[level_type::REPETITION], + s->abs_lvl_start[level_type::REPETITION], + s->abs_lvl_end[level_type::REPETITION], + max_batch_size, + rep_decode, + s->page.num_input_values); + } + + int processed = 0; + + // if this is a bounds page, we need to do extra work to find the start and/or end value index + if (is_bounds_pg) { + __shared__ int skipped_values; + __shared__ int skipped_leaf_values; + __shared__ int last_input_value; + __shared__ int end_val_idx; + + // need these for skip_rows case + auto const page_start_row = col->start_row + pp->chunk_row; + auto const max_row = min_row + num_rows; + auto const begin_row = page_start_row >= min_row ? 0 : min_row - page_start_row; + auto const max_page_rows = pp->num_rows - begin_row; + auto const page_rows = page_start_row + begin_row + max_page_rows <= max_row + ? max_page_rows + : max_row - (page_start_row + begin_row); + auto end_row = begin_row + page_rows; + int row_fudge = -1; + + // short circuit for no nulls + if (max_def == 0 && !has_repetition) { return {begin_row, end_row}; } + + int row_count = 0; + int leaf_count = 0; + bool skipped_values_set = false; + bool end_value_set = false; + + while (processed < s->page.num_input_values) { + int start_val = processed; + + if (has_repetition) { + decoders[level_type::REPETITION].decode_next(t); + __syncthreads(); + + // special case where page does not begin at a row boundary + if (processed == 0 && rep_decode[0] != 0) { + if (t == 0) { + skipped_values = 0; + skipped_leaf_values = 0; + } + skipped_values_set = true; + end_row++; // need to finish off the previous row + row_fudge = 0; + } + } + + // the # of rep/def levels will always be the same size + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); + + // do something with the level data + while (start_val < processed) { + int idx_t = start_val + t; + int idx = rolling_lvl_index(idx_t); + + // get absolute thread row index + int is_new_row = idx_t < processed && (!has_repetition || rep_decode[idx] == 0); + int thread_row_count, block_row_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_row, thread_row_count, block_row_count); + __syncthreads(); + + // get absolute thread leaf index + int const is_new_leaf = idx_t < processed && (def_decode[idx] >= max_def); + int thread_leaf_count, block_leaf_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(is_new_leaf, thread_leaf_count, block_leaf_count); + __syncthreads(); + + // if we have not set skipped values yet, see if we found the first in-bounds row + if (!skipped_values_set && row_count + block_row_count > begin_row) { + // if this thread is in row bounds + int const row_index = thread_row_count + row_count - 1; + int const in_row_bounds = + idx_t < processed && (row_index >= begin_row) && (row_index < end_row); + + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(in_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + if (global_count > 0) { + // this is the thread that represents the first row. need to test in_row_bounds for + // the case where we only want one row and local_count == 1 for many threads. + if (local_count == 1 && in_row_bounds) { + skipped_values = idx_t; + skipped_leaf_values = + leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); + } + skipped_values_set = true; + } + } + + // test if row_count will exceed end_row in this batch + if (!end_value_set && row_count + block_row_count >= end_row) { + // if this thread exceeds row bounds. row_fudge change depending on whether we've faked + // the end row to account for starting a page in the middle of a row. + int const row_index = thread_row_count + row_count + row_fudge; + int const exceeds_row_bounds = row_index >= end_row; + + int local_count, global_count; + block_scan(temp_storage.scan_storage) + .InclusiveSum(exceeds_row_bounds, local_count, global_count); + __syncthreads(); + + // we found it + if (global_count > 0) { + // this is the thread that represents the end row. + if (local_count == 1) { + last_input_value = idx_t; + end_val_idx = leaf_count + (is_new_leaf ? thread_leaf_count - 1 : thread_leaf_count); + } + end_value_set = true; + break; + } + } + + row_count += block_row_count; + leaf_count += block_leaf_count; + + start_val += preprocess_block_size; + } + __syncthreads(); + if (end_value_set) { break; } + } + + start_value = skipped_values_set ? skipped_leaf_values : 0; + end_value = end_value_set ? end_val_idx : leaf_count; + + if (t == 0) { + int const v0 = skipped_values_set ? skipped_values : 0; + int const vn = end_value_set ? last_input_value : s->num_input_values; + int const total_values = vn - v0; + int const total_leaf_values = end_value - start_value; + int const num_nulls = total_values - total_leaf_values; + pp->num_nulls = num_nulls; + pp->num_valids = total_leaf_values; + } + } + // already filtered out unwanted pages, so need to count all non-null values in this page + else { + int num_nulls = 0; + while (processed < s->page.num_input_values) { + int start_val = processed; + processed += decoders[level_type::DEFINITION].decode_next(t); + __syncthreads(); + + while (start_val < processed) { + int idx_t = start_val + t; + if (idx_t < processed) { + int idx = rolling_lvl_index(idx_t); + if (def_decode[idx] < max_def) { num_nulls++; } + } + start_val += preprocess_block_size; + } + __syncthreads(); + } + + int const null_count = block_reduce(temp_storage.reduce_storage).Sum(num_nulls); + + if (t == 0) { + pp->num_nulls = null_count; + pp->num_valids = pp->num_input_values - null_count; + } + __syncthreads(); + + end_value -= pp->num_nulls; + } + + return {start_value, end_value}; +} + +/** + * @brief Compute string size information for dictionary encoded strings. + * + * @param data Pointer to the start of the page data stream + * @param dict_base Pointer to the start of the dictionary + * @param dict_bits The number of bits used to in the dictionary bit packing + * @param dict_size Size of the dictionary in bytes + * @param data_size Size of the page data in bytes + * @param start_value Do not count values that occur before this index + * @param end_value Do not count values that occur after this index + */ +__device__ size_t totalDictEntriesSize(uint8_t const* data, + uint8_t const* dict_base, + int dict_bits, + int dict_size, + int data_size, + int start_value, + int end_value) +{ + int const t = threadIdx.x; + uint8_t const* ptr = data; + uint8_t const* const end = data + data_size; + int const bytecnt = (dict_bits + 7) >> 3; + size_t l_str_len = 0; // partial sums across threads + int pos = 0; // current value index in the data stream + int t0 = 0; // thread 0 for this batch + + int dict_run = 0; + int dict_val = 0; + + while (pos < end_value && ptr <= end) { + if (dict_run <= 1) { + dict_run = (ptr < end) ? get_vlq32(ptr, end) : 0; + if (!(dict_run & 1)) { + // Repeated value + if (ptr + bytecnt <= end) { + int32_t run_val = ptr[0]; + if (bytecnt > 1) { + run_val |= ptr[1] << 8; + if (bytecnt > 2) { + run_val |= ptr[2] << 16; + if (bytecnt > 3) { run_val |= ptr[3] << 24; } + } + } + dict_val = run_val & ((1 << dict_bits) - 1); + } + ptr += bytecnt; + } + } + + int batch_len; + if (dict_run & 1) { + // Literal batch: must output a multiple of 8, except for the last batch + int batch_len_div8; + batch_len = max(min(preprocess_block_size, (int)(dict_run >> 1) * 8), 1); + batch_len_div8 = (batch_len + 7) >> 3; + dict_run -= batch_len_div8 * 2; + ptr += batch_len_div8 * dict_bits; + } else { + batch_len = dict_run >> 1; + dict_run = 0; + } + + int const is_literal = dict_run & 1; + + // calculate my thread id for this batch. way to round-robin the work. + int mytid = t - t0; + if (mytid < 0) mytid += preprocess_block_size; + + // compute dictionary index. + if (is_literal) { + int dict_idx = 0; + if (mytid < batch_len) { + dict_idx = dict_val; + int32_t ofs = (mytid - ((batch_len + 7) & ~7)) * dict_bits; + const uint8_t* p = ptr + (ofs >> 3); + ofs &= 7; + if (p < end) { + uint32_t c = 8 - ofs; + dict_idx = (*p++) >> ofs; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { + dict_idx |= (*p++) << c; + c += 8; + if (c < dict_bits && p < end) { dict_idx |= (*p++) << c; } + } + } + dict_idx &= (1 << dict_bits) - 1; + } + + if (pos + mytid < end_value) { + uint32_t const dict_pos = (dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; + if (pos + mytid >= start_value && dict_pos < (uint32_t)dict_size) { + const auto* src = reinterpret_cast(dict_base + dict_pos); + l_str_len += src->second; + } + } + } + + t0 += batch_len; + } else { + int const start_off = + (pos < start_value && pos + batch_len > start_value) ? start_value - pos : 0; + batch_len = min(batch_len, end_value - pos); + if (mytid == 0) { + uint32_t const dict_pos = (dict_bits > 0) ? dict_val * sizeof(string_index_pair) : 0; + if (pos + batch_len > start_value && dict_pos < (uint32_t)dict_size) { + const auto* src = reinterpret_cast(dict_base + dict_pos); + l_str_len += (batch_len - start_off) * src->second; + } + } + + t0 += 1; + } + + t0 = t0 % preprocess_block_size; + pos += batch_len; + } + __syncthreads(); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage reduce_storage; + size_t sum_l = block_reduce(reduce_storage).Sum(l_str_len); + + return sum_l; +} + +/** + * @brief Compute string size information for plain encoded strings. + * + * @param data Pointer to the start of the page data stream + * @param data_size Length of data + * @param start_value Do not count values that occur before this index + * @param end_value Do not count values that occur after this index + */ +__device__ size_t totalPlainEntriesSize(uint8_t const* data, + int data_size, + int start_value, + int end_value) +{ + int const t = threadIdx.x; + int pos = 0; + size_t total_len = 0; + + // This step is purely serial + if (!t) { + const uint8_t* cur = data; + int k = 0; + + while (pos < end_value && k < data_size) { + int len; + if (k + 4 <= data_size) { + len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); + k += 4; + if (k + len > data_size) { len = 0; } + } else { + len = 0; + } + + k += len; + if (pos >= start_value) { total_len += len; } + pos++; + } + } + + return total_len; +} + +/** + * @brief Kernel for computing string page output size information. + * + * String columns need accurate data size information to preallocate memory in the column buffer to + * store the char data. This calls a kernel to calculate information needed by the string decoding + * kernel. On exit, the `str_bytes`, `num_nulls`, and `num_valids` fields of the PageInfo struct + * are updated. This call ignores non-string columns. + * + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param min_rows crop all rows below min_row + * @param num_rows Maximum number of rows to read + * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels + * @tparam level_t Type used to store decoded repetition and definition levels + */ +template +__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + // only count if it's a string column + if (not is_string_col(pages[blockIdx.x], chunks)) { return; } + + page_state_s* const s = &state_g; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; + + // reset str_bytes to 0 in case it's already been calculated + if (t == 0) { pp->str_bytes = 0; } + + // whether or not we have repetition levels (lists) + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // the level stream decoders + __shared__ rle_run def_runs[run_buffer_size]; + __shared__ rle_run rep_runs[run_buffer_size]; + rle_stream decoders[level_type::NUM_LEVEL_TYPES] = {{def_runs}, {rep_runs}}; + + // setup page info + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, string_filter{chunks}, false)) { + return; + } + + if (!t) { + s->page.num_nulls = 0; + s->page.num_valids = 0; + s->page.str_bytes = 0; + } + __syncthreads(); + + bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); + + // if we're skipping this page anyway, no need to count it + if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { return; } + + // find start/end value indices + auto const [start_value, end_value] = + page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders); + + // need to save num_nulls and num_valids calculated in page_bounds in this page + if (t == 0) { + pp->num_nulls = s->page.num_nulls; + pp->num_valids = s->page.num_valids; + } + + // now process string info in the range [start_value, end_value) + // set up for decoding strings...can be either plain or dictionary + auto const& col = s->col; + uint8_t const* data = s->data_start; + uint8_t const* const end = s->data_end; + uint8_t const* dict_base = nullptr; + int dict_size = 0; + size_t str_bytes = 0; + + switch (pp->encoding) { + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE_DICTIONARY: + // RLE-packed dictionary indices, first byte indicates index length in bits + if (col.str_dict_index) { + // String dictionary: use index + dict_base = reinterpret_cast(col.str_dict_index); + dict_size = col.page_info[0].num_input_values * sizeof(string_index_pair); + } else { + dict_base = col.page_info[0].page_data; // dictionary is always stored in the first page + dict_size = col.page_info[0].uncompressed_page_size; + } + + // FIXME: need to return an error condition...this won't actually do anything + if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); } + + str_bytes = totalDictEntriesSize( + data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value); + break; + case Encoding::PLAIN: + dict_size = static_cast(end - data); + str_bytes = is_bounds_pg ? totalPlainEntriesSize(data, dict_size, start_value, end_value) + : dict_size - sizeof(int) * (pp->num_input_values - pp->num_nulls); + break; + } + + if (t == 0) { + // TODO check for overflow + pp->str_bytes = str_bytes; + } +} + +/** + * @brief Kernel for computing the string column data stored in the pages + * + * This function will write the page data and the page data's validity to the + * output specified in the page's column chunk. + * + * This version uses a single warp to do the string copies. + * + * @param pages List of pages + * @param chunks List of column chunks + * @param min_row Row index to start reading at + * @param num_rows Maximum number of rows to read + * @tparam lvl_buf_size Size of the buffer used when decoding repetition and definition levels + * @tparam level_t Type used to store decoded repetition and definition levels + */ +template +__global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; + __shared__ __align__(4) size_type last_offset; + + page_state_s* const s = &state_g; + page_state_buffers_s* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + [[maybe_unused]] null_count_back_copier _{s, t}; + + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, string_filter{chunks}, true)) { + return; + } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + // offsets are local to the page + if (t == 0) { last_offset = 0; } + __syncthreads(); + + int const out_thread0 = s->dict_base && s->dict_bits == 0 ? 32 : 64; + int const leaf_level_index = s->col.max_nesting_depth - 1; + PageNestingDecodeInfo* const nesting_info_base = s->nesting_info; + + __shared__ level_t rep[lvl_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[lvl_buf_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t skipped_leaf_values = s->page.skipped_leaf_values; + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + int target_pos; + int src_pos = s->src_pos; + + if (t < out_thread0) { + target_pos = min(src_pos + 2 * (decode_block_size - out_thread0), + s->nz_count + (decode_block_size - out_thread0)); + } else { + target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); + if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } + } + __syncthreads(); + if (t < 32) { + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + } else if (t < out_thread0) { + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t src_target_pos = target_pos + skipped_leaf_values; + + // WARP1: Decode dictionary indices, booleans or string positions + if (s->dict_base) { + src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; + } else { + gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + } + if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } + } else { + int const me = t - out_thread0; + + // WARP1..WARP3: Decode values + src_pos += t - out_thread0; + + // the position in the output column/buffer + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; + + // for the flat hierarchy case we will be reading from the beginning of the value stream, + // regardless of the value of first_row. so adjust our destination offset accordingly. + // example: + // - user has passed skip_rows = 2, so our first_row to output is 2 + // - the row values we get from nz_idx will be + // 0, 1, 2, 3, 4 .... + // - by shifting these values by first_row, the sequence becomes + // -1, -2, 0, 1, 2 ... + // - so we will end up ignoring the first two input rows, and input rows 2..n will + // get written to the output starting at position 0. + // + if (!has_repetition) { dst_pos -= s->first_row; } + + // need to do this before we branch on src_pos/dst_pos so we don't deadlock + // choose a character parallel string copy when the average string is longer than a warp + using cudf::detail::warp_size; + auto const use_char_ll = + s->page.num_valids > 0 && (s->page.str_bytes / s->page.num_valids) >= warp_size; + + if (me < warp_size) { + for (int i = 0; i < decode_block_size - out_thread0; i += warp_size) { + dst_pos = sb->nz_idx[rolling_index(src_pos + i)]; + if (!has_repetition) { dst_pos -= s->first_row; } + + auto [ptr, len] = src_pos + i < target_pos && dst_pos >= 0 + ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values + i) + : cuda::std::pair{nullptr, 0}; + + __shared__ cub::WarpScan::TempStorage temp_storage; + size_type offset; + cub::WarpScan(temp_storage).ExclusiveSum(len, offset); + offset += last_offset; + + if (use_char_ll) { + __shared__ __align__(8) uint8_t const* pointers[warp_size]; + __shared__ __align__(4) size_type offsets[warp_size]; + __shared__ __align__(4) int dsts[warp_size]; + __shared__ __align__(4) int lengths[warp_size]; + + offsets[me] = offset; + pointers[me] = reinterpret_cast(ptr); + dsts[me] = dst_pos; + lengths[me] = len; + __syncwarp(); + + for (int ss = 0; ss < warp_size && ss + i + s->src_pos < target_pos; ss++) { + if (dsts[ss] >= 0) { + auto offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + + dsts[ss]; + *offptr = lengths[ss]; + auto str_ptr = nesting_info_base[leaf_level_index].string_out + offsets[ss]; + ll_strcpy(str_ptr, pointers[ss], lengths[ss], me); + } + } + + } else { + if (src_pos + i < target_pos && dst_pos >= 0) { + auto offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; + *offptr = len; + auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset; + memcpy(str_ptr, ptr, len); + } + __syncwarp(); + } + + // last thread in warp updates last_offset + if (me == warp_size - 1) { last_offset = offset + len; } + __syncwarp(); + } + } + + if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } + } + __syncthreads(); + } + + // now turn array of lengths into offsets + int value_count = nesting_info_base[leaf_level_index].value_count; + + // if no repetition we haven't calculated start/end bounds and instead just skipped + // values until we reach first_row. account for that here. + if (!has_repetition) { value_count -= s->first_row; } + + auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + block_excl_sum(offptr, value_count, s->page.str_offset); +} + +} // anonymous namespace + +/** + * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + */ +void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + int level_type_size, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(preprocess_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + if (level_type_size == 1) { + gpuComputePageStringSizes + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuComputePageStringSizes + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } +} + +/** + * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + */ +void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + dim3 dim_block(decode_block_size, 1); + dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } else { + gpuDecodeStringPageData + <<>>(pages.device_ptr(), chunks, min_row, num_rows); + } +} + +} // namespace gpu +} // namespace parquet +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 7b93b527540..25d2885b7da 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -112,6 +112,7 @@ struct PageNestingDecodeInfo { int32_t valid_count; int32_t value_count; uint8_t* data_out; + uint8_t* string_out; bitmask_type* valid_map; }; @@ -162,9 +163,11 @@ struct PageInfo { // - In the case of a nested schema, you have to decode the repetition and definition // levels to extract actual column values int32_t num_input_values; - int32_t chunk_row; // starting row of this page relative to the start of the chunk - int32_t num_rows; // number of rows in this page - int32_t num_nulls; // number of null values (V2 header) + int32_t chunk_row; // starting row of this page relative to the start of the chunk + int32_t num_rows; // number of rows in this page + // the next two are calculated in gpuComputePageStringSizes + int32_t num_nulls; // number of null values (V2 header), but recalculated for string cols + int32_t num_valids; // number of non-null values, taking into account skip_rows/num_rows int32_t chunk_idx; // column chunk this page belongs to int32_t src_col_schema; // schema index of this column uint8_t flags; // PAGEINFO_FLAGS_XXX @@ -187,6 +190,7 @@ struct PageInfo { // for string columns only, the size of all the chars in the string for // this page. only valid/computed during the base preprocess pass int32_t str_bytes; + int32_t str_offset; // offset into string data for this page // nesting information (input/output) for each page. this array contains // input column nesting information, output column nesting information and @@ -241,6 +245,7 @@ struct ColumnChunkDesc { str_dict_index(nullptr), valid_map_base{nullptr}, column_data_base{nullptr}, + column_string_base{nullptr}, codec(codec_), converted_type(converted_type_), logical_type(logical_type_), @@ -270,6 +275,7 @@ struct ColumnChunkDesc { string_index_pair* str_dict_index; // index for string dictionary bitmask_type** valid_map_base; // base pointers of valid bit map for this column void** column_data_base; // base pointers of column data + void** column_string_base; // base pointers of column string data int8_t codec; // compressed codec enum int8_t converted_type; // converted type enum LogicalType logical_type; // logical type @@ -418,6 +424,15 @@ struct EncPage { compression_result* comp_res; //!< Ptr to compression result }; +/** + * @brief Test if the given column chunk is in a string column + */ +constexpr bool is_string_col(ColumnChunkDesc const& chunk) +{ + return (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4 and + chunk.converted_type != DECIMAL; +} + /** * @brief Launches kernel for parsing the page headers in the column chunks * @@ -472,6 +487,28 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, int level_type_size, rmm::cuda_stream_view stream); +/** + * @brief Compute string page output size information. + * + * String columns need accurate data size information to preallocate memory in the column buffer to + * store the char data. This calls a kernel to calculate information needed by the string decoding + * kernel. On exit, the `str_bytes`, `num_nulls`, `num_valids`, and `str_offset` fields of the + * PageInfo struct are updated. This call ignores non-string columns. + * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] min_rows crop all rows below min_row + * @param[in] num_rows Maximum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] stream CUDA stream to use, default 0 + */ +void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + int level_type_size, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the column data stored in the pages * @@ -492,6 +529,26 @@ void DecodePageData(cudf::detail::hostdevice_vector& pages, int level_type_size, rmm::cuda_stream_view stream); +/** + * @brief Launches kernel for reading the string column data stored in the pages + * + * The page data will be written to the output pointed to in the page's + * associated column chunk. + * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] num_rows Total number of rows to read + * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] stream CUDA stream to use, default 0 + */ +void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder row group fragments * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 7a6e11537d6..0237bf820b0 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -17,11 +17,30 @@ #include "reader_impl.hpp" #include +#include #include namespace cudf::io::detail::parquet { +namespace { + +int constexpr NUM_DECODERS = 2; // how many decode kernels are there to run +int constexpr APPROX_NUM_THREADS = 4; // guestimate from DaveB +int constexpr STREAM_POOL_SIZE = NUM_DECODERS * APPROX_NUM_THREADS; + +auto& get_stream_pool() +{ + // TODO: creating this on the heap because there were issues with trying to call the + // stream pool destructor during cuda shutdown that lead to a segmentation fault in + // nvbench. this allocation is being deliberately leaked to avoid the above, but still + // results in non-fatal warnings when running nvbench in cuda-gdb. + static auto pool = new rmm::cuda_stream_pool{STREAM_POOL_SIZE}; + return *pool; +} + +} // namespace + void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { auto& chunks = _file_itm_data.chunks; @@ -37,6 +56,29 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); + // Check to see if there are any string columns present. If so, then we need to get size info + // for each string page. This size info will be used to pre-allocate memory for the column, + // allowing the page decoder to write string data directly to the column buffer, rather than + // doing a gather operation later on. + // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, + // chunked reader). + auto const has_strings = std::any_of(chunks.begin(), chunks.end(), gpu::is_string_col); + + std::vector col_sizes(_input_columns.size(), 0L); + if (has_strings) { + gpu::ComputePageStringSizes( + pages, chunks, skip_rows, num_rows, _file_itm_data.level_type_size, _stream); + + col_sizes = calculate_page_string_offsets(); + + // check for overflow + if (std::any_of(col_sizes.begin(), col_sizes.end(), [](size_t sz) { + return sz > std::numeric_limits::max(); + })) { + CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); + } + } + // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` @@ -44,6 +86,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) cudf::detail::hostdevice_vector(sum_max_depths, _stream); auto chunk_nested_data = cudf::detail::hostdevice_vector(sum_max_depths, _stream); auto chunk_offsets = std::vector(); + auto chunk_nested_str_data = + cudf::detail::hostdevice_vector(has_strings ? sum_max_depths : 0, _stream); // Update chunks with pointers to column data. for (size_t c = 0, page_count = 0, chunk_off = 0; c < chunks.size(); c++) { @@ -64,6 +108,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto data = chunk_nested_data.host_ptr(chunk_off); chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); + auto str_data = has_strings ? chunk_nested_str_data.host_ptr(chunk_off) : nullptr; + chunks[c].column_string_base = + has_strings ? chunk_nested_str_data.device_ptr(chunk_off) : nullptr; + chunk_off += max_depth; // fill in the arrays on the host. there are some important considerations to @@ -106,6 +154,11 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (owning_schema == 0 || owning_schema == input_col.schema_idx) { valids[idx] = out_buf.null_mask(); data[idx] = out_buf.data(); + // only do string buffer for leaf + if (out_buf.string_size() == 0 && col_sizes[chunks[c].src_col_index] > 0) { + out_buf.create_string_data(col_sizes[chunks[c].src_col_index], _stream); + } + if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= static_cast(input_col.schema_idx) & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; } else { @@ -121,8 +174,18 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); + _stream.synchronize(); - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, _stream); + auto stream1 = get_stream_pool().get_stream(); + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream1); + if (has_strings) { + auto stream2 = get_stream_pool().get_stream(); + chunk_nested_str_data.host_to_device_async(stream2); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, _file_itm_data.level_type_size, stream2); + stream2.synchronize(); + } + stream1.synchronize(); pages.device_to_host_async(_stream); page_nesting.device_to_host_async(_stream); @@ -144,21 +207,28 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto& out_buf = (*cols)[input_col.nesting[l_idx]]; cols = &out_buf.children; - if (out_buf.type.id() != type_id::LIST || - (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED)) { - continue; + if (out_buf.type.id() == type_id::LIST && + (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED) == 0) { + CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); + auto const& child = (*cols)[input_col.nesting[l_idx + 1]]; + + // the final offset for a list at level N is the size of it's child + int const offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; + CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), + &offset, + sizeof(offset), + cudaMemcpyDefault, + _stream.value())); + out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; + } else if (out_buf.type.id() == type_id::STRING) { + // need to cap off the string offsets column + size_type const sz = static_cast(col_sizes[idx]); + cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, + &sz, + sizeof(size_type), + cudaMemcpyDefault, + _stream.value()); } - CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); - auto& child = (*cols)[input_col.nesting[l_idx + 1]]; - - // the final offset for a list at level N is the size of it's child - int offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(offset), - cudaMemcpyDefault, - _stream.value())); - out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } } @@ -232,7 +302,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Don't need to do it if we read the file all at once. if (_chunk_read_limit > 0) { for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); } } } @@ -350,7 +420,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_read_limit > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(column_buffer::empty_like(buff)); + _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 4d627c41433..d25bf1e4c1f 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -24,8 +24,6 @@ #include "parquet_gpu.hpp" #include "reader_impl_helpers.hpp" -#include - #include #include #include @@ -38,6 +36,7 @@ #include namespace cudf::io::detail::parquet { + /** * @brief Implementation for Parquet reader */ @@ -221,6 +220,13 @@ class reader::impl { */ void allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds); + /** + * @brief Calculate per-page offsets for string data + * + * @return Vector of total string data sizes for each column + */ + std::vector calculate_page_string_offsets(); + /** * @brief Converts the page data and outputs to columns. * @@ -240,10 +246,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 76ce7cb68bd..006b8d69aad 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -383,7 +383,9 @@ aggregate_reader_metadata::select_row_groups( return {rows_to_skip, rows_to_read, std::move(selection)}; } -std::tuple, std::vector, std::vector> +std::tuple, + std::vector, + std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, bool strings_to_categorical, @@ -400,17 +402,17 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -431,7 +433,7 @@ aggregate_reader_metadata::select_columns(std::optional : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -471,7 +473,7 @@ aggregate_reader_metadata::select_columns(std::optional to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 748f0164244..0192dcd373b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -188,7 +188,7 @@ class aggregate_reader_metadata { * indices */ [[nodiscard]] std:: - tuple, std::vector, std::vector> + tuple, std::vector, std::vector> select_columns(std::optional> const& use_names, bool include_index, bool strings_to_categorical, diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 53f4c5f2e18..efc221691f7 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1269,8 +1269,12 @@ struct get_page_num_rows { __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } }; -struct get_page_schema { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.src_col_schema; } +struct get_page_column_index { + gpu::ColumnChunkDesc const* chunks; + __device__ size_type operator()(gpu::PageInfo const& page) + { + return chunks[page.chunk_idx].src_col_index; + } }; struct input_col_info { @@ -1493,6 +1497,43 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector& page } } +struct page_to_string_size { + gpu::PageInfo* pages; + gpu::ColumnChunkDesc const* chunks; + + __device__ size_t operator()(size_type page_idx) const + { + auto const page = pages[page_idx]; + auto const chunk = chunks[page.chunk_idx]; + + if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { + return 0; + } + return pages[page_idx].str_bytes; + } +}; + +struct page_offset_output_iter { + gpu::PageInfo* p; + size_type const* index; + + using value_type = size_type; + using difference_type = size_type; + using pointer = size_type*; + using reference = size_type&; + using iterator_category = thrust::output_device_iterator_tag; + + __host__ __device__ page_offset_output_iter operator+(int i) + { + return page_offset_output_iter{p, index + i}; + } + + __host__ __device__ void operator++() { index++; } + + __device__ reference operator[](int i) { return p[index[i]].str_offset; } + __device__ reference operator*() { return p[*index].str_offset; } +}; + } // anonymous namespace void reader::impl::preprocess_pages(size_t skip_rows, @@ -1529,7 +1570,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, pages.device_ptr(), pages.device_ptr() + pages.size(), page_keys.begin(), - get_page_schema{}); + get_page_column_index{chunks.device_ptr()}); thrust::sequence(rmm::exec_policy(_stream), page_index.begin(), page_index.end()); thrust::stable_sort_by_key(rmm::exec_policy(_stream), @@ -1648,16 +1689,16 @@ void reader::impl::preprocess_pages(size_t skip_rows, page_input, chunk_row_output_iter{pages.device_ptr()}); - // preserve page ordering data - _chunk_itm_data.page_keys = std::move(page_keys); - _chunk_itm_data.page_index = std::move(page_index); - // retrieve pages back pages.device_to_host_sync(_stream); // print_pages(pages, _stream); } + // preserve page ordering data for string decoder + _chunk_itm_data.page_keys = std::move(page_keys); + _chunk_itm_data.page_index = std::move(page_index); + // compute splits if necessary. otherwise return a single split representing // the whole file. _chunk_read_info = chunk_read_limit > 0 @@ -1800,4 +1841,44 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses } } +std::vector reader::impl::calculate_page_string_offsets() +{ + auto& chunks = _file_itm_data.chunks; + auto& pages = _file_itm_data.pages_info; + auto const& page_keys = _chunk_itm_data.page_keys; + auto const& page_index = _chunk_itm_data.page_index; + + std::vector col_sizes(_input_columns.size(), 0L); + rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); + + // use page_index to fetch page string sizes in the proper order + auto val_iter = thrust::make_transform_iterator( + page_index.begin(), page_to_string_size{pages.device_ptr(), chunks.device_ptr()}); + + // do scan by key to calculate string offsets for each page + thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + val_iter, + page_offset_output_iter{pages.device_ptr(), page_index.data()}); + + // now sum up page sizes + rmm::device_uvector reduce_keys(col_sizes.size(), _stream); + thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + val_iter, + reduce_keys.begin(), + d_col_sizes.begin()); + + cudaMemcpyAsync(col_sizes.data(), + d_col_sizes.data(), + sizeof(size_t) * col_sizes.size(), + cudaMemcpyDeviceToHost, + _stream); + _stream.synchronize(); + + return col_sizes; +} + } // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 18b17bc8611..9b8754d6318 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -26,43 +26,66 @@ #include -namespace cudf { -namespace io { -namespace detail { +namespace cudf::io::detail { -void column_buffer::create(size_type _size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { - size = _size; + CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); + // The contents of _strings will never be directly returned to the user. + // Due to the fact that make_strings_column copies the input data to + // produce its outputs, _strings is actually a temporary. As a result, we + // do not pass the provided mr to the call to + // make_zeroed_device_uvector_async here and instead let it use the + // default rmm memory resource. + _strings = std::make_unique>( + cudf::detail::make_zeroed_device_uvector_async( + size, stream, rmm::mr::get_current_device_resource())); +} - switch (type.id()) { - case type_id::STRING: - // The contents of _strings will never be directly returned to the user. - // Due to the fact that make_strings_column copies the input data to - // produce its outputs, _strings is actually a temporary. As a result, we - // do not pass the provided mr to the call to - // make_zeroed_device_uvector_async here and instead let it use the - // default rmm memory resource. - _strings = std::make_unique>( - cudf::detail::make_zeroed_device_uvector_async( - size, stream, rmm::mr::get_current_device_resource())); - break; +std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +{ + // make_strings_column allocates new memory, it does not simply move + // from the inputs, so we need to pass it the memory resource given to + // the buffer on construction so that the memory is allocated using the + // resource that the calling code expected. + return make_strings_column(*_strings, stream, _mr); +} - // list columns store a buffer of int32's as offsets to represent - // their individual rows - case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, mr); break; +void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); + // size + 1 for final offset. _string_data will be initialized later. + _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); +} - // struct columns store no data themselves. just validity and children. - case type_id::STRUCT: break; +void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +{ + _string_data = rmm::device_buffer(num_bytes, stream, _mr); +} - default: _data = create_data(type, size, stream, mr); break; - } - if (is_nullable) { - _null_mask = - cudf::detail::create_null_mask(size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); - } - this->mr = mr; +std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +{ + // no need for copies, just transfer ownership of the data_buffers to the columns + auto const state = mask_state::UNALLOCATED; + auto str_col = + _string_data.size() == 0 + ? make_empty_column(data_type{type_id::INT8}) + : std::make_unique(data_type{type_id::INT8}, + string_size(), + std::move(_string_data), + cudf::detail::create_null_mask(size, state, stream, _mr), + state_null_count(state, size), + std::vector>{}); + auto offsets_col = + std::make_unique(data_type{type_to_id()}, + size + 1, + std::move(_data), + cudf::detail::create_null_mask(size + 1, state, stream, _mr), + state_null_count(state, size + 1), + std::vector>{}); + + return make_strings_column( + size, std::move(offsets_col), std::move(str_col), null_count(), std::move(_null_mask)); } namespace { @@ -73,26 +96,55 @@ namespace { * @param buff The old output buffer * @param new_buff The new output buffer */ -void copy_buffer_data(column_buffer const& buff, column_buffer& new_buff) +template +void copy_buffer_data(string_policy const& buff, string_policy& new_buff) { new_buff.name = buff.name; new_buff.user_data = buff.user_data; for (auto const& child : buff.children) { - auto& new_child = new_buff.children.emplace_back(column_buffer(child.type, child.is_nullable)); + auto& new_child = new_buff.children.emplace_back(string_policy(child.type, child.is_nullable)); copy_buffer_data(child, new_child); } } } // namespace -column_buffer column_buffer::empty_like(column_buffer const& input) +template +void column_buffer_base::create(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size = _size; + _mr = mr; + + switch (type.id()) { + case type_id::STRING: static_cast(this)->allocate_strings_data(stream); break; + + // list columns store a buffer of int32's as offsets to represent + // their individual rows + case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, _mr); break; + + // struct columns store no data themselves. just validity and children. + case type_id::STRUCT: break; + + default: _data = create_data(type, size, stream, _mr); break; + } + if (is_nullable) { + _null_mask = cudf::detail::create_null_mask( + size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), _mr); + } +} + +template +string_policy column_buffer_base::empty_like(string_policy const& input) { - auto new_buff = column_buffer(input.type, input.is_nullable); + auto new_buff = string_policy(input.type, input.is_nullable); copy_buffer_data(input, new_buff); return new_buff; } -std::unique_ptr make_column(column_buffer& buffer, +template +std::unique_ptr make_column(column_buffer_base& buffer, column_name_info* schema_info, std::optional const& schema, rmm::cuda_stream_view stream) @@ -111,10 +163,10 @@ std::unique_ptr make_column(column_buffer& buffer, // from the inputs, so we need to pass it the memory resource given to // the buffer on construction so that the memory is allocated using the // resource that the calling code expected. - return make_strings_column(*buffer._strings, stream, buffer.mr); + return buffer.make_string_column(stream); } else { // convert to binary - auto const string_col = make_strings_column(*buffer._strings, stream, buffer.mr); + auto const string_col = buffer.make_string_column(stream); auto const num_rows = string_col->size(); auto const null_count = string_col->null_count(); auto col_content = string_col->release(); @@ -160,7 +212,7 @@ std::unique_ptr make_column(column_buffer& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = make_column(buffer.children[0], child_info, child_schema, stream); + auto child = make_column(buffer.children[0], child_info, child_schema, stream); // make the final list column (note : size is the # of offsets, so our actual # of rows is 1 // less) @@ -170,7 +222,7 @@ std::unique_ptr make_column(column_buffer& buffer, buffer._null_count, std::move(buffer._null_mask), stream, - buffer.mr); + buffer._mr); } break; case type_id::STRUCT: { @@ -190,7 +242,7 @@ std::unique_ptr make_column(column_buffer& buffer, : std::nullopt; output_children.emplace_back( - make_column(buffer.children[i], child_info, child_schema, stream)); + make_column(buffer.children[i], child_info, child_schema, stream)); } return make_structs_column(buffer.size, @@ -198,7 +250,7 @@ std::unique_ptr make_column(column_buffer& buffer, buffer._null_count, std::move(buffer._null_mask), stream, - buffer.mr); + buffer._mr); } break; default: { @@ -214,7 +266,8 @@ std::unique_ptr make_column(column_buffer& buffer, /** * @copydoc cudf::io::detail::empty_like */ -std::unique_ptr empty_like(column_buffer& buffer, +template +std::unique_ptr empty_like(column_buffer_base& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -235,7 +288,8 @@ std::unique_ptr empty_like(column_buffer& buffer, // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = empty_like(buffer.children[0], child_info, stream, mr); + auto child = + cudf::io::detail::empty_like(buffer.children[0], child_info, stream, mr); // make the final list column return make_lists_column( @@ -248,13 +302,14 @@ std::unique_ptr empty_like(column_buffer& buffer, std::transform(buffer.children.begin(), buffer.children.end(), std::back_inserter(output_children), - [&](column_buffer& col) { + [&](auto& col) { column_name_info* child_info = nullptr; if (schema_info != nullptr) { schema_info->children.push_back(column_name_info{""}); child_info = &schema_info->children.back(); } - return empty_like(col, child_info, stream, mr); + return cudf::io::detail::empty_like( + col, child_info, stream, mr); }); return make_structs_column( @@ -265,6 +320,34 @@ std::unique_ptr empty_like(column_buffer& buffer, } } -} // namespace detail -} // namespace io -} // namespace cudf +using pointer_type = gather_column_buffer; +using string_type = inline_column_buffer; + +using pointer_column_buffer = column_buffer_base; +using string_column_buffer = column_buffer_base; + +template std::unique_ptr make_column( + string_column_buffer& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); + +template std::unique_ptr make_column( + pointer_column_buffer& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); + +template std::unique_ptr empty_like(string_column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template std::unique_ptr empty_like(pointer_column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template class column_buffer_base; +template class column_buffer_base; +} // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 1625a56d1c9..2ee7c17e480 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -62,42 +62,62 @@ inline rmm::device_buffer create_data(data_type type, using string_index_pair = thrust::pair; +// forward declare friend functions +template +class column_buffer_base; + /** - * @brief Class for holding device memory buffers to column data that eventually - * will be used to create a column. + * @brief Creates a column from an existing set of device memory buffers. + * + * @throws std::bad_alloc if device memory allocation fails + * + * @param buffer Column buffer descriptors + * @param schema_info Schema information for the column to write optionally. + * @param schema Optional schema used to control string to binary conversions. + * @param stream CUDA stream used for device memory operations and kernel launches. + * + * @return `std::unique_ptr` Column from the existing device data */ -struct column_buffer { - column_buffer() = default; - - // construct without a known size. call create() later to actually - // allocate memory - column_buffer(data_type _type, bool _is_nullable) : type(_type), is_nullable(_is_nullable) {} - - // construct with a known size. allocates memory - column_buffer(data_type _type, - size_type _size, - bool _is_nullable, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : column_buffer(_type, _is_nullable) +template +std::unique_ptr make_column(column_buffer_base& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); + +template +class column_buffer_base { + public: + column_buffer_base() = default; + + // construct without a known size. call create() later to actually allocate memory + column_buffer_base(data_type _type, bool _is_nullable) : type(_type), is_nullable(_is_nullable) {} + + column_buffer_base(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _is_nullable) { - create(_size, stream, mr); } // move constructor - column_buffer(column_buffer&& col) = default; - column_buffer& operator=(column_buffer&& col) = default; + column_buffer_base(column_buffer_base&& col) = default; + column_buffer_base& operator=(column_buffer_base&& col) = default; // copy constructor - column_buffer(column_buffer const& col) = delete; - column_buffer& operator=(column_buffer const& col) = delete; + column_buffer_base(column_buffer_base const& col) = delete; + column_buffer_base& operator=(column_buffer_base const& col) = delete; // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); - auto data() { return _strings ? _strings->data() : _data.data(); } - auto data_size() const { return _strings ? _strings->size() : _data.size(); } + // Create a new column_buffer that has empty data but with the same basic information as the + // input column, including same type, nullability, name, and user_data. + static string_policy empty_like(string_policy const& input); + + void set_null_mask(rmm::device_buffer&& mask) { _null_mask = std::move(mask); } template auto null_mask() @@ -105,43 +125,112 @@ struct column_buffer { return static_cast(_null_mask.data()); } auto null_mask_size() { return _null_mask.size(); } - auto& null_count() { return _null_count; } - // Create a new column_buffer that has empty data but with the same basic information as the - // input column, including same type, nullability, name, and user_data. - static column_buffer empty_like(column_buffer const& input); + auto data() { return static_cast(this)->data_impl(); } + auto data() const { return static_cast(this)->data_impl(); } + auto data_size() const { return static_cast(this)->data_size_impl(); } - std::unique_ptr> _strings; + std::unique_ptr make_string_column(rmm::cuda_stream_view stream) + { + return static_cast(this)->make_string_column_impl(stream); + } + + protected: rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; size_type _null_count{0}; + rmm::mr::device_memory_resource* _mr; + public: data_type type{type_id::EMPTY}; bool is_nullable{false}; size_type size{0}; - std::vector children; uint32_t user_data{0}; // arbitrary user data std::string name; - rmm::mr::device_memory_resource* mr; + std::vector children; + + friend std::unique_ptr make_column( + column_buffer_base& buffer, + column_name_info* schema_info, + std::optional const& schema, + rmm::cuda_stream_view stream); }; -/** - * @brief Creates a column from an existing set of device memory buffers. - * - * @throws std::bad_alloc if device memory allocation fails - * - * @param buffer Column buffer descriptors - * @param schema_info Schema information for the column to write optionally. - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return `std::unique_ptr` Column from the existing device data - */ -std::unique_ptr make_column(column_buffer& buffer, - column_name_info* schema_info, - std::optional const& schema, - rmm::cuda_stream_view stream); +// column buffer that uses a string_index_pair for strings data, requiring a gather step when +// creating a string column +class gather_column_buffer : public column_buffer_base { + public: + gather_column_buffer() = default; + + // construct without a known size. call create() later to actually allocate memory + gather_column_buffer(data_type _type, bool _is_nullable) + : column_buffer_base(_type, _is_nullable) + { + } + + gather_column_buffer(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _size, _is_nullable, stream, mr) + { + create(_size, stream, mr); + } + + void allocate_strings_data(rmm::cuda_stream_view stream); + + void* data_impl() { return _strings ? _strings->data() : _data.data(); } + void const* data_impl() const { return _strings ? _strings->data() : _data.data(); } + size_t data_size_impl() const { return _strings ? _strings->size() : _data.size(); } + + std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); + + public: + std::unique_ptr> _strings; +}; + +// column buffer that stores string data internally which can be passed directly when +// creating a string column +class inline_column_buffer : public column_buffer_base { + public: + inline_column_buffer() = default; + + // construct without a known size. call create() later to actually allocate memory + inline_column_buffer(data_type _type, bool _is_nullable) + : column_buffer_base(_type, _is_nullable) + { + } + + inline_column_buffer(data_type _type, + size_type _size, + bool _is_nullable, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : column_buffer_base(_type, _size, _is_nullable, stream, mr) + { + create(_size, stream, mr); + } + + void allocate_strings_data(rmm::cuda_stream_view stream); + + void* data_impl() { return _data.data(); } + void const* data_impl() const { return _data.data(); } + size_t data_size_impl() const { return _data.size(); } + std::unique_ptr make_string_column_impl(rmm::cuda_stream_view stream); + + void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream); + void* string_data() { return _string_data.data(); } + void const* string_data() const { return _string_data.data(); } + size_t string_size() const { return _string_data.size(); } + + private: + rmm::device_buffer _string_data{}; +}; + +using column_buffer = gather_column_buffer; /** * @brief Creates an equivalent empty column from an existing set of device memory buffers. @@ -158,7 +247,8 @@ std::unique_ptr make_column(column_buffer& buffer, * * @return `std::unique_ptr` Column from the existing device data */ -std::unique_ptr empty_like(column_buffer& buffer, +template +std::unique_ptr empty_like(column_buffer_base& buffer, column_name_info* schema_info, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr);