From c23d28893db51b417abad60c83a08c1ab3de1398 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Thu, 4 Mar 2021 14:29:16 -0600
Subject: [PATCH 1/2] Fix issues with output partition sizes growing past the
 32 bit limit.

---
 cpp/src/copying/contiguous_split.cu | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 3f8cd4014f1..06c368e35ce 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -94,7 +94,7 @@ struct dst_buf_info {
   int num_rows;  // # of rows (which may be different from num_elements in the case of validity or
                  // offset buffers)
   int src_row_index;  // row index to start reading from from my associated source buffer
-  int dst_offset;     // my offset into the per-partition allocation
+  size_t dst_offset;  // my offset into the per-partition allocation
   int value_shift;    // amount to shift values down by (for offset buffers)
   int bit_shift;      // # of bits to shift right by (for validity buffers)
   size_type valid_count;
@@ -142,12 +142,12 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
                             int num_rows,
                             size_type* valid_count)
 {
-  src += (src_row_index * element_size);
+  src += (static_cast<size_t>(src_row_index) * static_cast<size_t>(element_size));
 
   size_type thread_valid_count = 0;
 
   // handle misalignment. read 16 bytes in 4 byte reads. write in a single 16 byte store.
-  const size_t num_bytes = num_elements * element_size;
+  const size_t num_bytes = static_cast<size_t>(num_elements) * static_cast<size_t>(element_size);
   // how many bytes we're misaligned from 4-byte alignment
   const uint32_t ofs = reinterpret_cast<uintptr_t>(src) % 4;
   size_t pos = t * 16;
@@ -191,7 +191,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
     // alignment must be a multiple of 4. value shifting and bit shifting are mututally exclusive
     // and will never both be true at the same time.
     if (value_shift || bit_shift) {
-      int idx = (num_bytes - remainder) / 4;
+      size_t idx = (num_bytes - remainder) / 4;
       uint32_t v = remainder > 0 ? (reinterpret_cast<uint32_t*>(src)[idx] - value_shift) : 0;
       while (remainder) {
         uint32_t next =
@@ -205,7 +205,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
       }
     } else {
       while (remainder) {
-        int idx = num_bytes - remainder--;
+        size_t idx = num_bytes - remainder--;
         uint32_t val = reinterpret_cast<uint32_t*>(src)[idx];
         if (valid_count) { thread_valid_count += __popc(val); }
         reinterpret_cast<uint32_t*>(dst)[idx] = val;
@@ -224,8 +224,9 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
       // we may have copied more bits than there are actual rows in the output.
       // so we need to subtract off the count of any bits that shouldn't have been
       // considered during the copy step.
-      int max_row = (num_bytes * 8);
-      int slack_bits = max_row > num_rows ? max_row - num_rows : 0;
+      size_t max_row = (num_bytes * 8);
+      size_t slack_bits =
+        max_row > static_cast<size_t>(num_rows) ? max_row - static_cast<size_t>(num_rows) : 0;
       auto slack_mask = set_most_significant_bits(slack_bits);
       if (slack_mask > 0) {
         uint32_t last_word = reinterpret_cast<uint32_t*>(dst + (num_bytes - 4))[0];
@@ -722,10 +723,10 @@ struct split_key_functor {
  */
 struct dst_offset_output_iterator {
   dst_buf_info* c;
-  using value_type = int;
-  using difference_type = int;
-  using pointer = int*;
-  using reference = int&;
+  using value_type = size_t;
+  using difference_type = size_t;
+  using pointer = size_t*;
+  using reference = size_t&;
   using iterator_category = thrust::output_device_iterator_tag;
 
   dst_offset_output_iterator operator+ __host__ __device__(int i)
@@ -929,7 +930,7 @@ std::vector contiguous_split(cudf::table_view const& input,
         return num_rows;
       }();
       int const element_size = cudf::type_dispatcher(data_type{src_info.type}, size_of_helper{});
-      size_t const bytes = num_elements * element_size;
+      size_t const bytes = static_cast<size_t>(num_elements) * static_cast<size_t>(element_size);
       return dst_buf_info{_round_up_safe(bytes, 64),
                           num_elements,
                           element_size,
@@ -969,7 +970,7 @@ std::vector contiguous_split(cudf::table_view const& input,
                                   keys + num_bufs,
                                   values,
                                   dst_offset_output_iterator{d_dst_buf_info},
-                                  0);
+                                  static_cast<size_t>(0));
   }
 
   // DtoH buf sizes and col info back to the host
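
The failure mode PATCH 1 addresses is the product of two 32-bit quantities (element count times
element size) overflowing before it is ever widened to 64 bits. A minimal standalone sketch of the
pattern and the fix, independent of cudf (values are illustrative, not from the patch):

    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    int main()
    {
      // ~550 million rows of an 8-byte type: a single 4.4 GB output partition.
      int num_elements = 550'000'000;
      int element_size = 8;

      // Broken pattern: the multiply happens in 32-bit arithmetic and wraps
      // before the result is stored in a wider type. (Demonstrated with
      // uint32_t so the wrap is well-defined; with signed int it is UB.)
      uint32_t wrapped =
        static_cast<uint32_t>(num_elements) * static_cast<uint32_t>(element_size);

      // The fix applied throughout the patch: widen each operand first, then
      // multiply, so the product is computed in 64 bits.
      std::size_t correct =
        static_cast<std::size_t>(num_elements) * static_cast<std::size_t>(element_size);

      std::cout << "wrapped: " << wrapped << "\n";  // prints 105032704
      std::cout << "correct: " << correct << "\n";  // prints 4400000000
      return 0;
    }

The same widening has to happen everywhere a byte count, offset, or bit count is derived from
32-bit row or element counts, which is why the patch touches num_bytes, idx, max_row, and the
dst_offset field alike.
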
From b505147f1bc6a183297bfae5b9e1352f45437ea4 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Thu, 4 Mar 2021 16:18:11 -0600
Subject: [PATCH 2/2] Changed size_t declarations to std::size_t. Added a few
 missing consts.

---
 cpp/src/copying/contiguous_split.cu | 123 ++++++++++++++--------------
 1 file changed, 62 insertions(+), 61 deletions(-)

diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 06c368e35ce..9a2f0f26f74 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -38,8 +38,8 @@ namespace {
 
 // align all column size allocations to this boundary so that all output column buffers
 // start at that alignment.
-static constexpr size_t split_align = 64;
-inline __device__ size_t _round_up_safe(size_t number_to_round, size_t modulus)
+static constexpr std::size_t split_align = 64;
+inline __device__ std::size_t _round_up_safe(std::size_t number_to_round, std::size_t modulus)
 {
   auto remainder = number_to_round % modulus;
   if (remainder == 0) { return number_to_round; }
@@ -88,15 +88,15 @@ struct src_buf_info {
  * M partitions, then we have N*M destination buffers.
  */
 struct dst_buf_info {
-  size_t buf_size;   // total size of buffer, including padding
-  int num_elements;  // # of elements to be copied
-  int element_size;  // size of each element in bytes
+  std::size_t buf_size;  // total size of buffer, including padding
+  int num_elements;      // # of elements to be copied
+  int element_size;      // size of each element in bytes
   int num_rows;  // # of rows (which may be different from num_elements in the case of validity or
                  // offset buffers)
-  int src_row_index;  // row index to start reading from from my associated source buffer
-  size_t dst_offset;  // my offset into the per-partition allocation
-  int value_shift;    // amount to shift values down by (for offset buffers)
-  int bit_shift;      // # of bits to shift right by (for validity buffers)
+  int src_row_index;       // row index to start reading from from my associated source buffer
+  std::size_t dst_offset;  // my offset into the per-partition allocation
+  int value_shift;         // amount to shift values down by (for offset buffers)
+  int bit_shift;           // # of bits to shift right by (for validity buffers)
   size_type valid_count;
 };
 
@@ -133,24 +133,24 @@ template <int block_size>
 __device__ void copy_buffer(uint8_t* __restrict__ dst,
                             uint8_t* __restrict__ src,
                             int t,
-                            int num_elements,
-                            int element_size,
-                            int src_row_index,
+                            std::size_t num_elements,
+                            std::size_t element_size,
+                            std::size_t src_row_index,
                             uint32_t stride,
                             int value_shift,
                             int bit_shift,
-                            int num_rows,
+                            std::size_t num_rows,
                             size_type* valid_count)
 {
-  src += (static_cast<size_t>(src_row_index) * static_cast<size_t>(element_size));
+  src += (src_row_index * element_size);
 
   size_type thread_valid_count = 0;
 
   // handle misalignment. read 16 bytes in 4 byte reads. write in a single 16 byte store.
-  const size_t num_bytes = static_cast<size_t>(num_elements) * static_cast<size_t>(element_size);
+  std::size_t const num_bytes = num_elements * element_size;
   // how many bytes we're misaligned from 4-byte alignment
-  const uint32_t ofs = reinterpret_cast<uintptr_t>(src) % 4;
-  size_t pos = t * 16;
+  uint32_t const ofs = reinterpret_cast<uintptr_t>(src) % 4;
+  std::size_t pos = t * 16;
   stride *= 16;
   while (pos + 20 <= num_bytes) {
     // read from the nearest aligned address.
@@ -175,12 +175,12 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
 
   // copy trailing bytes
   if (t == 0) {
-    size_t remainder;
+    std::size_t remainder;
     if (num_bytes < 16) {
       remainder = num_bytes;
     } else {
-      size_t last_bracket = (num_bytes / 16) * 16;
-      remainder = num_bytes - last_bracket;
+      std::size_t const last_bracket = (num_bytes / 16) * 16;
+      remainder = num_bytes - last_bracket;
       if (remainder < 4) {
         // we had less than 20 bytes for the last possible 16 byte copy, so copy 16 + the extra
         remainder += 16;
@@ -191,12 +191,12 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
     // alignment must be a multiple of 4. value shifting and bit shifting are mututally exclusive
     // and will never both be true at the same time.
     if (value_shift || bit_shift) {
-      size_t idx = (num_bytes - remainder) / 4;
-      uint32_t v = remainder > 0 ? (reinterpret_cast<uint32_t*>(src)[idx] - value_shift) : 0;
+      std::size_t idx = (num_bytes - remainder) / 4;
+      uint32_t v = remainder > 0 ? (reinterpret_cast<uint32_t*>(src)[idx] - value_shift) : 0;
       while (remainder) {
-        uint32_t next =
+        uint32_t const next =
           remainder > 0 ? (reinterpret_cast<uint32_t*>(src)[idx + 1] - value_shift) : 0;
-        uint32_t val = (v >> bit_shift) | (next << (32 - bit_shift));
+        uint32_t const val = (v >> bit_shift) | (next << (32 - bit_shift));
         if (valid_count) { thread_valid_count += __popc(val); }
         reinterpret_cast<uint32_t*>(dst)[idx] = val;
         v = next;
@@ -205,8 +205,8 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
       }
     } else {
       while (remainder) {
-        size_t idx = num_bytes - remainder--;
-        uint32_t val = reinterpret_cast<uint32_t*>(src)[idx];
+        std::size_t const idx = num_bytes - remainder--;
+        uint32_t const val    = reinterpret_cast<uint32_t*>(src)[idx];
         if (valid_count) { thread_valid_count += __popc(val); }
         reinterpret_cast<uint32_t*>(dst)[idx] = val;
       }
@@ -224,12 +224,11 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
       // we may have copied more bits than there are actual rows in the output.
      // so we need to subtract off the count of any bits that shouldn't have been
       // considered during the copy step.
-      size_t max_row = (num_bytes * 8);
-      size_t slack_bits =
-        max_row > static_cast<size_t>(num_rows) ? max_row - static_cast<size_t>(num_rows) : 0;
-      auto slack_mask = set_most_significant_bits(slack_bits);
+      std::size_t const max_row    = (num_bytes * 8);
+      std::size_t const slack_bits = max_row > num_rows ? max_row - num_rows : 0;
+      auto const slack_mask        = set_most_significant_bits(slack_bits);
       if (slack_mask > 0) {
-        uint32_t last_word = reinterpret_cast<uint32_t*>(dst + (num_bytes - 4))[0];
+        uint32_t const last_word = reinterpret_cast<uint32_t*>(dst + (num_bytes - 4))[0];
         block_valid_count -= __popc(last_word & slack_mask);
       }
       *valid_count = block_valid_count;
@@ -261,9 +260,9 @@ __global__ void copy_partition(int num_src_bufs,
                                uint8_t** dst_bufs,
                                dst_buf_info* buf_info)
 {
-  int const partition_index = blockIdx.x / num_src_bufs;
-  int const src_buf_index = blockIdx.x % num_src_bufs;
-  size_t const buf_index = (partition_index * num_src_bufs) + src_buf_index;
+  int const partition_index   = blockIdx.x / num_src_bufs;
+  int const src_buf_index     = blockIdx.x % num_src_bufs;
+  std::size_t const buf_index = (partition_index * num_src_bufs) + src_buf_index;
 
   // copy, shifting offsets and validity bits as needed
   copy_buffer<block_size>(
@@ -323,7 +322,7 @@ bool is_offset_type(type_id id) { return (id == type_id::STRING or id == type_id
  * @returns Total offset stack size needed for this range of columns.
  */
 template <typename InputIter>
-size_t compute_offset_stack_size(InputIter begin, InputIter end, int offset_depth = 0)
+std::size_t compute_offset_stack_size(InputIter begin, InputIter end, int offset_depth = 0)
 {
   return std::accumulate(begin, end, 0, [offset_depth](auto stack_size, column_view const& col) {
     auto const num_buffers = 1 + (col.nullable() ? 1 : 0);
@@ -703,7 +702,7 @@ BufInfo build_output_columns(InputIter begin,
  */
 struct buf_size_functor {
   dst_buf_info const* ci;
-  size_t operator() __device__(int index) { return static_cast<size_t>(ci[index].buf_size); }
+  std::size_t operator() __device__(int index) { return ci[index].buf_size; }
 };
 
 /**
@@ -723,10 +722,10 @@ struct split_key_functor {
  */
 struct dst_offset_output_iterator {
   dst_buf_info* c;
-  using value_type = size_t;
-  using difference_type = size_t;
-  using pointer = size_t*;
-  using reference = size_t&;
+  using value_type        = std::size_t;
+  using difference_type   = std::size_t;
+  using pointer           = std::size_t*;
+  using reference         = std::size_t&;
   using iterator_category = thrust::output_device_iterator_tag;
 
   dst_offset_output_iterator operator+ __host__ __device__(int i)
@@ -779,7 +778,7 @@ std::vector contiguous_split(cudf::table_view const& input,
   }
   {
     size_type begin = 0;
-    for (size_t i = 0; i < splits.size(); i++) {
+    for (std::size_t i = 0; i < splits.size(); i++) {
       size_type end = splits[i];
       CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative.");
       CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index.");
@@ -788,8 +787,8 @@ std::vector contiguous_split(cudf::table_view const& input,
     }
   }
 
-  size_t const num_partitions = splits.size() + 1;
-  size_t const num_root_columns = input.num_columns();
+  std::size_t const num_partitions   = splits.size() + 1;
+  std::size_t const num_root_columns = input.num_columns();
 
   // if inputs are empty, just return num_partitions empty tables
   if (input.column(0).size() == 0) {
@@ -811,12 +810,12 @@ std::vector contiguous_split(cudf::table_view const& input,
   // compute # of source buffers (column data, validity, children), # of partitions
   // and total # of buffers
   size_type const num_src_bufs = count_src_bufs(input.begin(), input.end());
-  size_t const num_bufs = num_src_bufs * num_partitions;
+  std::size_t const num_bufs = num_src_bufs * num_partitions;
 
   // packed block of memory 1. split indices and src_buf_info structs
-  size_t const indices_size =
+  std::size_t const indices_size =
     cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align);
-  size_t const src_buf_info_size =
+  std::size_t const src_buf_info_size =
     cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align);
   // host-side
   std::vector<uint8_t> h_indices_and_source_info(indices_size + src_buf_info_size);
@@ -826,7 +825,8 @@ std::vector contiguous_split(cudf::table_view const& input,
   // device-side
   // gpu-only : stack space needed for nested list offset calculation
   int const offset_stack_partition_size = compute_offset_stack_size(input.begin(), input.end());
-  size_t const offset_stack_size = offset_stack_partition_size * num_partitions * sizeof(size_type);
+  std::size_t const offset_stack_size =
+    offset_stack_partition_size * num_partitions * sizeof(size_type);
   rmm::device_buffer d_indices_and_source_info(indices_size + src_buf_info_size + offset_stack_size,
                                                stream,
                                                rmm::mr::get_current_device_resource());
@@ -853,33 +853,33 @@ std::vector contiguous_split(cudf::table_view const& input,
                             stream.value()));
 
   // packed block of memory 2. partition buffer sizes and dst_buf_info structs
-  size_t const buf_sizes_size =
-    cudf::util::round_up_safe(num_partitions * sizeof(size_t), split_align);
-  size_t const dst_buf_info_size =
+  std::size_t const buf_sizes_size =
+    cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align);
+  std::size_t const dst_buf_info_size =
     cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align);
   // host-side
   std::vector<uint8_t> h_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size);
-  size_t* h_buf_sizes = reinterpret_cast<size_t*>(h_buf_sizes_and_dst_info.data());
+  std::size_t* h_buf_sizes = reinterpret_cast<std::size_t*>(h_buf_sizes_and_dst_info.data());
   dst_buf_info* h_dst_buf_info =
     reinterpret_cast<dst_buf_info*>(h_buf_sizes_and_dst_info.data() + buf_sizes_size);
 
   // device-side
   rmm::device_buffer d_buf_sizes_and_dst_info(
     buf_sizes_size + dst_buf_info_size, stream, rmm::mr::get_current_device_resource());
-  size_t* d_buf_sizes = reinterpret_cast<size_t*>(d_buf_sizes_and_dst_info.data());
+  std::size_t* d_buf_sizes = reinterpret_cast<std::size_t*>(d_buf_sizes_and_dst_info.data());
   dst_buf_info* d_dst_buf_info = reinterpret_cast<dst_buf_info*>(
     static_cast<uint8_t*>(d_buf_sizes_and_dst_info.data()) + buf_sizes_size);
 
   // compute sizes of each column in each partition, including alignment.
   thrust::transform(
     rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_t>(0),
-    thrust::make_counting_iterator<size_t>(num_bufs),
+    thrust::make_counting_iterator<std::size_t>(0),
+    thrust::make_counting_iterator<std::size_t>(num_bufs),
     d_dst_buf_info,
     [num_src_bufs,
      d_indices,
      d_src_buf_info,
      d_offset_stack,
-     offset_stack_partition_size] __device__(size_t t) {
+     offset_stack_partition_size] __device__(std::size_t t) {
       int const split_index = t / num_src_bufs;
       int const src_buf_index = t % num_src_bufs;
       auto const& src_info = d_src_buf_info[src_buf_index];
@@ -930,7 +930,8 @@ std::vector contiguous_split(cudf::table_view const& input,
         return num_rows;
       }();
       int const element_size = cudf::type_dispatcher(data_type{src_info.type}, size_of_helper{});
-      size_t const bytes = static_cast<size_t>(num_elements) * static_cast<size_t>(element_size);
+      std::size_t const bytes =
+        static_cast<std::size_t>(num_elements) * static_cast<std::size_t>(element_size);
       return dst_buf_info{_round_up_safe(bytes, 64),
                           num_elements,
                           element_size,
@@ -970,7 +971,7 @@ std::vector contiguous_split(cudf::table_view const& input,
                                   keys + num_bufs,
                                   values,
                                   dst_offset_output_iterator{d_dst_buf_info},
-                                  static_cast<size_t>(0));
+                                  std::size_t{0});
   }
 
   // DtoH buf sizes and col info back to the host
@@ -987,15 +988,15 @@ std::vector contiguous_split(cudf::table_view const& input,
   std::transform(h_buf_sizes,
                  h_buf_sizes + num_partitions,
                  std::back_inserter(out_buffers),
-                 [stream, mr](size_t bytes) {
+                 [stream, mr](std::size_t bytes) {
                    return rmm::device_buffer{bytes, stream, mr};
                  });
 
   // packed block of memory 3. pointers to source and destination buffers (and stack space on the
   // gpu for offset computation)
-  size_t const src_bufs_size =
+  std::size_t const src_bufs_size =
     cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align);
-  size_t const dst_bufs_size =
+  std::size_t const dst_bufs_size =
     cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align);
   // host-side
   std::vector<uint8_t> h_src_and_dst_buffers(src_bufs_size + dst_bufs_size);
@@ -1040,7 +1041,7 @@ std::vector contiguous_split(cudf::table_view const& input,
   std::vector<column_view> cols;
   cols.reserve(num_root_columns);
   auto cur_dst_buf_info = h_dst_buf_info;
-  for (size_t idx = 0; idx < num_partitions; idx++) {
+  for (std::size_t idx = 0; idx < num_partitions; idx++) {
     // traverse the buffers and build the columns.
     cur_dst_buf_info = build_output_columns(
       input.begin(), input.end(), cur_dst_buf_info, std::back_inserter(cols), h_dst_bufs[idx]);
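
A note on the dst_offset_output_iterator and scan changes threaded through both patches: each
buffer's destination offset within its partition allocation is produced by an exclusive scan over
the padded buffer sizes, so once a partition's running total can pass INT_MAX, the scan's
accumulator (fixed by the initial value and the output iterator's value_type) must be 64-bit as
well. A host-side analogue using the standard library rather than thrust, with made-up buffer
sizes that are not taken from the patch:

    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <vector>

    int main()
    {
      // Padded sizes of three buffers in one partition, 1.5 GB each.
      std::vector<std::size_t> buf_sizes(3, 1'500'000'000);

      // Exclusive scan: each buffer's offset is the sum of all sizes before it.
      // Passing std::size_t{0} as the initial value keeps the accumulation in
      // 64 bits, mirroring why the patches change the iterator's int typedefs
      // and the literal 0 handed to thrust::exclusive_scan_by_key.
      std::vector<std::size_t> dst_offsets(buf_sizes.size());
      std::exclusive_scan(
        buf_sizes.begin(), buf_sizes.end(), dst_offsets.begin(), std::size_t{0});

      for (auto o : dst_offsets) { std::cout << o << "\n"; }
      // prints 0, 1500000000, 3000000000 -- the last already exceeds INT_MAX
      return 0;
    }

With a 32-bit accumulator the third offset would have wrapped, landing a copy inside an earlier
buffer's range; widening the scan makes the offsets monotonic for partitions past the 2 GB mark.
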