From 36f56c97b94446f29fef5d2ddd8818275a28e406 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 20 Dec 2023 20:16:53 -0800 Subject: [PATCH] Add row conversion code from spark-rapids-jni (#14664) This temporarily moves the row conversion code from spark-rapids-jni into libcudf. It is necessary to have the row conversion code compiled in a static library to overcome a CCCL issue that triggers invalid memory access when calling to `thrust::in(ex)clusive_scan` (https://github.com/NVIDIA/spark-rapids-jni/issues/1567). In the future, when we have CCCL updated to fix the issue (1567), we may need to move the code back into spark-rapids-jni. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Vyas Ramasubramani (https://github.com/vyasr) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/14664 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/row_conversion.hpp | 54 + .../src/transform}/row_conversion.cu | 1577 ++++++++++------- cpp/tests/CMakeLists.txt | 2 + cpp/tests/transform/row_conversion.cpp | 1021 +++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 142 -- java/src/main/native/CMakeLists.txt | 1 - java/src/main/native/src/TableJni.cpp | 85 - java/src/main/native/src/row_conversion.hpp | 52 - .../test/java/ai/rapids/cudf/TableTest.java | 67 - 10 files changed, 1990 insertions(+), 1012 deletions(-) create mode 100644 cpp/include/cudf/row_conversion.hpp rename {java/src/main/native/src => cpp/src/transform}/row_conversion.cu (58%) create mode 100644 cpp/tests/transform/row_conversion.cpp delete mode 100644 java/src/main/native/src/row_conversion.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4163275744e..a7c34ca489c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -628,6 +628,7 @@ add_library( src/transform/nans_to_nulls.cu src/transform/one_hot_encode.cu src/transform/row_bit_count.cu + src/transform/row_conversion.cu src/transform/transform.cpp src/transpose/transpose.cu src/unary/cast_ops.cu diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp new file mode 100644 index 00000000000..89453d49856 --- /dev/null +++ b/cpp/include/cudf/row_conversion.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace cudf { +//! @cond Doxygen_Suppress + +std::vector> convert_to_rows_fixed_width_optimized( + cudf::table_view const& tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::vector> convert_to_rows( + cudf::table_view const& tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows( + cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +//! @endcond +} // namespace cudf diff --git a/java/src/main/native/src/row_conversion.cu b/cpp/src/transform/row_conversion.cu similarity index 58% rename from java/src/main/native/src/row_conversion.cu rename to cpp/src/transform/row_conversion.cu index fd7e7bc0b31..f3af90fb54f 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -15,14 +15,15 @@ */ #include -#include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -42,15 +43,13 @@ #include #include -#include "row_conversion.hpp" - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 #define ASYNC_MEMCPY_SUPPORTED #endif #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) #include -#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) +#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) #include #include @@ -61,6 +60,8 @@ #include #include +#include + namespace { constexpr auto JCUDF_ROW_ALIGNMENT = 8; @@ -68,30 +69,31 @@ constexpr auto JCUDF_ROW_ALIGNMENT = 8; constexpr auto MAX_BATCH_SIZE = std::numeric_limits::max(); // Number of rows each block processes in the two kernels. Tuned via nsight -constexpr auto NUM_STRING_ROWS_PER_BLOCK_TO_ROWS = 1024; +constexpr auto NUM_STRING_ROWS_PER_BLOCK_TO_ROWS = 1024; constexpr auto NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS = 64; -constexpr auto MIN_STRING_BLOCKS = 32; -constexpr auto MAX_STRING_BLOCKS = MAX_BATCH_SIZE; +constexpr auto MIN_STRING_BLOCKS = 32; +constexpr auto MAX_STRING_BLOCKS = MAX_BATCH_SIZE; constexpr auto NUM_WARPS_IN_BLOCK = 32; -} // anonymous namespace +} // anonymous namespace // needed to suppress warning about cuda::barrier #pragma nv_diag_suppress static_var_with_dynamic_init using namespace cudf; using detail::make_device_uvector_async; +using detail::make_device_uvector_sync; using rmm::device_uvector; #ifdef ASYNC_MEMCPY_SUPPORTED using cuda::aligned_size_t; #else -template using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. -#endif // ASYNC_MEMCPY_SUPPORTED +template +using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. +#endif // ASYNC_MEMCPY_SUPPORTED namespace cudf { -namespace jni { namespace detail { /* @@ -157,8 +159,9 @@ struct tile_info { int end_row; int batch_number; - __device__ inline size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { + __device__ inline size_type get_shared_row_size(size_type const* const col_offsets, + size_type const* const col_sizes) const + { // this calculation is invalid if there are holes in the data such as a variable-width column. // It is wrong in a safe way in that it will say this row size is larger than it should be, so // we are not losing data we are just not as efficient as we could be with shared memory. This @@ -180,9 +183,9 @@ struct tile_info { * */ struct row_batch { - size_type num_bytes; // number of bytes in this batch - size_type row_count; // number of rows in the batch - device_uvector row_offsets; // offsets column of output cudf column + size_type num_bytes; // number of bytes in this batch + size_type row_count; // number of rows in the batch + device_uvector row_offsets; // offsets column of output cudf column }; /** @@ -190,11 +193,11 @@ struct row_batch { * */ struct batch_data { - device_uvector batch_row_offsets; // offsets to each row in incoming data - device_uvector d_batch_row_boundaries; // row numbers for the start of each batch + device_uvector batch_row_offsets; // offsets to each row in incoming data + device_uvector d_batch_row_boundaries; // row numbers for the start of each batch std::vector - batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 - std::vector row_batches; // information about each batch such as byte count + batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 + std::vector row_batches; // information about each batch such as byte count }; /** @@ -207,8 +210,10 @@ struct batch_data { * offsets into the string column */ std::pair, rmm::device_uvector> -build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validity_size, - rmm::cuda_stream_view stream) { +build_string_row_offsets(table_view const& tbl, + size_type fixed_width_and_validity_size, + rmm::cuda_stream_view stream) +{ auto const num_rows = tbl.num_rows(); rmm::device_uvector d_row_sizes(num_rows, stream); thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); @@ -216,41 +221,48 @@ build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validi auto d_offsets_iterators = [&]() { std::vector offsets_iterators; auto offsets_iter = thrust::make_transform_iterator( - tbl.begin(), [](auto const &col) -> strings_column_view::offset_iterator { - if (!is_fixed_width(col.type())) { - CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); - return strings_column_view(col).offsets_begin(); - } else { - return nullptr; - } - }); - std::copy_if(offsets_iter, offsets_iter + tbl.num_columns(), + tbl.begin(), [](auto const& col) -> strings_column_view::offset_iterator { + if (!is_fixed_width(col.type())) { + CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); + return strings_column_view(col).offsets_begin(); + } else { + return nullptr; + } + }); + std::copy_if(offsets_iter, + offsets_iter + tbl.num_columns(), std::back_inserter(offsets_iterators), - [](auto const &offset_ptr) { return offset_ptr != nullptr; }); - return make_device_uvector_async(offsets_iterators, stream, - rmm::mr::get_current_device_resource()); + [](auto const& offset_ptr) { return offset_ptr != nullptr; }); + return make_device_uvector_sync( + offsets_iterators, stream, rmm::mr::get_current_device_resource()); }(); auto const num_columns = static_cast(d_offsets_iterators.size()); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_columns * num_rows), - [d_offsets_iterators = d_offsets_iterators.data(), num_columns, num_rows, + [d_offsets_iterators = d_offsets_iterators.data(), + num_columns, + num_rows, d_row_sizes = d_row_sizes.data()] __device__(auto element_idx) { auto const row = element_idx % num_rows; auto const col = element_idx / num_rows; auto const val = - d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; + d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; atomicAdd(&d_row_sizes[row], val); }); // transform the row sizes to include fixed width size and alignment - thrust::transform( - rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), d_row_sizes.begin(), - cuda::proclaim_return_type([fixed_width_and_validity_size] __device__( - auto row_size) { - return util::round_up_unsafe(fixed_width_and_validity_size + row_size, JCUDF_ROW_ALIGNMENT); - })); + thrust::transform(rmm::exec_policy(stream), + d_row_sizes.begin(), + d_row_sizes.end(), + d_row_sizes.begin(), + cuda::proclaim_return_type( + [fixed_width_and_validity_size] __device__(auto row_size) { + return util::round_up_unsafe(fixed_width_and_validity_size + row_size, + JCUDF_ROW_ALIGNMENT); + })); return {std::move(d_row_sizes), std::move(d_offsets_iterators)}; } @@ -261,9 +273,10 @@ build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validi */ struct string_row_offset_functor { string_row_offset_functor(device_span d_row_offsets) - : d_row_offsets(d_row_offsets){}; + : d_row_offsets(d_row_offsets){}; - __device__ inline size_type operator()(int row_number, int) const { + __device__ inline size_type operator()(int row_number, int) const + { return d_row_offsets[row_number]; } @@ -276,9 +289,10 @@ struct string_row_offset_functor { */ struct fixed_width_row_offset_functor { fixed_width_row_offset_functor(size_type fixed_width_only_row_size) - : _fixed_width_only_row_size(fixed_width_only_row_size){}; + : _fixed_width_only_row_size(fixed_width_only_row_size){}; - __device__ inline size_type operator()(int row_number, int tile_row_start) const { + __device__ inline size_type operator()(int row_number, int tile_row_start) const + { return (row_number - tile_row_start) * _fixed_width_only_row_size; } @@ -300,11 +314,15 @@ struct fixed_width_row_offset_functor { * @param output_nm array of pointers to the output null masks * @param input_data pointing to the incoming row data */ -__global__ void -copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type num_columns, - const size_type row_size, const size_type *input_offset_in_row, - const size_type *num_bytes, int8_t **output_data, - bitmask_type **output_nm, const int8_t *input_data) { +__global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* input_offset_in_row, + const size_type* num_bytes, + int8_t** output_data, + bitmask_type** output_nm, + const int8_t* input_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -319,30 +337,30 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // are controlled by the x dimension (there are multiple blocks in the x // dimension). - size_type const rows_per_group = blockDim.x; - size_type const row_group_start = blockIdx.x; + size_type const rows_per_group = blockDim.x; + size_type const row_group_start = blockIdx.x; size_type const row_group_stride = gridDim.x; - size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; // Because we are copying fixed width only data and we stride the rows // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (auto row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t const *long_input = reinterpret_cast(input_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t const* long_input = reinterpret_cast(input_data); - auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); auto const shared_output_stride = blockDim.x * blockDim.y; - auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); - auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - auto const shared_length = row_size * num_rows_in_group; + auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); + auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + auto const shared_length = row_size * num_rows_in_group; size_type const shared_output_end = shared_length / sizeof(int64_t); @@ -363,33 +381,33 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // But we might not use all of the threads if the number of rows does not go // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data in for the next row group. - uint32_t active_mask = __ballot_sync(0xffff'ffffu, row_index < num_rows); + uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); if (row_index < num_rows) { - auto const col_index_start = threadIdx.y; + auto const col_index_start = threadIdx.y; auto const col_index_stride = blockDim.y; for (auto col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - auto const col_size = num_bytes[col_index]; - int8_t const *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; + auto const col_size = num_bytes[col_index]; + int8_t const* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t* col_output = output_data[col_index]; switch (col_size) { case 1: { col_output[row_index] = *col_tmp; break; } case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); + int16_t* short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); + int32_t* int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); + int64_t* long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); break; } default: { @@ -402,25 +420,29 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n } } - bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + bitmask_type* nm = output_nm[col_index]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { - nm[word_index(row_index)] = bitmask; - } - } // end column loop - } // end row copy + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + } // end column loop + } // end row copy // wait for the row_group to be totally copied before starting on the next row group __syncthreads(); } } -__global__ void copy_to_rows_fixed_width_optimized( - const size_type start_row, const size_type num_rows, const size_type num_columns, - const size_type row_size, const size_type *output_offset_in_row, const size_type *num_bytes, - const int8_t **input_data, const bitmask_type **input_nm, int8_t *output_data) { +__global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* output_offset_in_row, + const size_type* num_bytes, + const int8_t** input_data, + const bitmask_type** input_nm, + int8_t* output_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -437,18 +459,18 @@ __global__ void copy_to_rows_fixed_width_optimized( // are controlled by the x dimension (there are multiple blocks in the x // dimension). - size_type rows_per_group = blockDim.x; - size_type row_group_start = blockIdx.x; + size_type rows_per_group = blockDim.x; + size_type row_group_start = blockIdx.x; size_type row_group_stride = gridDim.x; - size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; // Because we are copying fixed width only data and we stride the rows // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { @@ -459,31 +481,31 @@ __global__ void copy_to_rows_fixed_width_optimized( // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data back out. if (row_index < (start_row + num_rows)) { - size_type col_index_start = threadIdx.y; + size_type col_index_start = threadIdx.y; size_type col_index_stride = blockDim.y; for (size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; + size_type col_size = num_bytes[col_index]; + int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t* col_input = input_data[col_index]; switch (col_size) { case 1: { *col_tmp = col_input[row_index]; break; } case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; + const int16_t* short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; + const int32_t* int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; + const int64_t* long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; break; } default: { @@ -497,11 +519,11 @@ __global__ void copy_to_rows_fixed_width_optimized( } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); + size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); // Now copy validity for the column if (input_nm[col_index]) { if (bit_is_set(input_nm[col_index], row_index)) { @@ -513,24 +535,22 @@ __global__ void copy_to_rows_fixed_width_optimized( // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end column loop - } // end row copy + } // end column loop + } // end row copy // wait for the row_group to be totally copied into shared memory __syncthreads(); // Step 2: Copy the data back out // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t *long_output = reinterpret_cast(output_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t* long_output = reinterpret_cast(output_data); - size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); size_type shared_input_stride = blockDim.x * blockDim.y; - size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } + size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - size_type shared_length = row_size * num_rows_in_group; + size_type shared_length = row_size * num_rows_in_group; size_type shared_input_end = shared_length / sizeof(int64_t); @@ -549,7 +569,7 @@ __global__ void copy_to_rows_fixed_width_optimized( #define MEMCPY(dst, src, size, barrier) cuda::memcpy_async(dst, src, size, barrier) #else #define MEMCPY(dst, src, size, barrier) memcpy(dst, src, size) -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED /** * @brief copy data from cudf columns into JCUDF format, which is row-based @@ -568,12 +588,17 @@ __global__ void copy_to_rows_fixed_width_optimized( * */ template -__global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, +__global__ void copy_to_rows(const size_type num_rows, + const size_type num_columns, const size_type shmem_used_per_tile, - device_span tile_infos, const int8_t **input_data, - const size_type *col_sizes, const size_type *col_offsets, - RowOffsetFunctor row_offsets, size_type const *batch_row_boundaries, - int8_t **output_data) { + device_span tile_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -583,21 +608,19 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum // any calculation to do here, but it is important to note. auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared_data[]; #ifdef ASYNC_MEMCPY_SUPPORTED __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { - init(&tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto const tile = tile_infos[blockIdx.x]; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); - auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); + auto const tile = tile_infos[blockIdx.x]; + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); + auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[tile.start_col]; // to do the copy we need to do n column copies followed by m element copies OR we have to do m @@ -612,12 +635,11 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum // works on a row for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; relative_col += warp.meta_group_size()) { - - auto const absolute_col = relative_col + tile.start_col; - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; + auto const absolute_col = relative_col + tile.start_col; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; auto const relative_col_offset = col_offset - starting_column_offset; - auto const col_ptr = input_data[absolute_col]; + auto const col_ptr = input_data[absolute_col]; if (col_ptr == nullptr) { // variable-width data column @@ -626,7 +648,6 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum for (int relative_row = warp.thread_rank(); relative_row < num_tile_rows; relative_row += warp.size()) { - if (relative_row >= num_tile_rows) { // out of bounds continue; @@ -634,23 +655,23 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto const absolute_row = relative_row + tile.start_row; auto const shared_offset = relative_row * tile_row_size + relative_col_offset; - auto const input_src = col_ptr + col_size * absolute_row; + auto const input_src = col_ptr + col_size * absolute_row; // copy the element from global memory switch (col_size) { case 2: { - const int16_t *short_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *short_col_input; + const int16_t* short_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *short_col_input; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *int_col_input; + const int32_t* int_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *int_col_input; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *long_col_input; + const int64_t* long_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *long_col_input; break; } case 1: shared_data[shared_offset] = *input_src; break; @@ -691,7 +712,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -710,58 +731,60 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * */ template -__global__ void -copy_validity_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, int8_t **output_data, - const size_type validity_offset, device_span tile_infos, - const bitmask_type **input_nm) { +__global__ void copy_validity_to_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type validity_offset, + device_span tile_infos, + const bitmask_type** input_nm) +{ extern __shared__ int8_t shared_data[]; // each thread of warp reads a single int32 of validity - so we read 128 bytes then ballot_sync // the bits and write the result to shmem after we fill shared mem memcpy it out in a blob. auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier shared_tile_barrier; - if (group.thread_rank() == 0) { - init(&shared_tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto tile = tile_infos[blockIdx.x]; + auto tile = tile_infos[blockIdx.x]; auto const num_tile_cols = tile.num_cols(); auto const num_tile_rows = tile.num_rows(); auto const threads_per_warp = warp.size(); - auto const rows_per_read = cudf::detail::size_in_bits(); + auto const rows_per_read = cudf::detail::size_in_bits(); auto const num_sections_x = util::div_rounding_up_unsafe(num_tile_cols, threads_per_warp); auto const num_sections_y = util::div_rounding_up_unsafe(num_tile_rows, rows_per_read); auto const validity_data_row_length = util::round_up_unsafe( - util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT); + util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT); auto const total_sections = num_sections_x * num_sections_y; // the tile is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections; my_section_idx += warp.meta_group_size()) { // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * threads_per_warp + warp.thread_rank(); - auto const relative_row = section_y * rows_per_read; - auto const absolute_col = relative_col + tile.start_col; - auto const absolute_row = relative_row + tile.start_row; - auto const participating = absolute_col < num_columns && absolute_row < num_rows; - auto const participation_mask = __ballot_sync(0xFFFF'FFFFu, participating); + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + auto const relative_col = section_x * threads_per_warp + warp.thread_rank(); + auto const relative_row = section_y * rows_per_read; + auto const absolute_col = relative_col + tile.start_col; + auto const absolute_row = relative_row + tile.start_row; + auto const participating = absolute_col < num_columns && absolute_row < num_rows; + auto const participation_mask = __ballot_sync(0xFFFFFFFF, participating); if (participating) { - auto my_data = input_nm[absolute_col] != nullptr ? - input_nm[absolute_col][word_index(absolute_row)] : - std::numeric_limits::max(); + auto my_data = input_nm[absolute_col] != nullptr + ? input_nm[absolute_col][word_index(absolute_row)] + : std::numeric_limits::max(); // every thread that is participating in the warp has 4 bytes, but it's column-based data and // we need it in row-based. So we shuffle the bits around with ballot_sync to make the bytes @@ -771,19 +794,19 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + (relative_col / CHAR_BIT); + validity_data_row_length * (relative_row + i) + (relative_col / CHAR_BIT); if (warp.thread_rank() == 0) { - *reinterpret_cast(&shared_data[validity_write_offset]) = validity_data; + *reinterpret_cast(&shared_data[validity_write_offset]) = validity_data; } } } } auto const output_data_base = - output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; + output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; // each warp copies a row at a time - auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); + auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); auto const row_batch_start = tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; // make sure entire tile has finished copy @@ -809,7 +832,7 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, shared_tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -828,42 +851,46 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, * */ template -__global__ void copy_strings_to_rows(size_type const num_rows, size_type const num_variable_columns, - int8_t const **variable_input_data, - size_type const *variable_col_output_offsets, - size_type const **variable_col_offsets, - size_type fixed_width_row_size, RowOffsetFunctor row_offsets, - size_type const batch_row_offset, int8_t *output_data) { +__global__ void copy_strings_to_rows(size_type const num_rows, + size_type const num_variable_columns, + int8_t const** variable_input_data, + size_type const* variable_col_output_offsets, + size_type const** variable_col_offsets, + size_type fixed_width_row_size, + RowOffsetFunctor row_offsets, + size_type const batch_row_offset, + int8_t* output_data) +{ // Each block will take a group of rows controlled by NUM_STRING_ROWS_PER_BLOCK_TO_ROWS. Each warp // will copy a row at a time. The base thread will first go through column data and fill out // offset/length information for the column. Then all threads of the warp will participate in the // memcpy of the string data. auto const my_block = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(my_block); + auto const warp = cooperative_groups::tiled_partition(my_block); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; #endif auto const start_row = - blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; + blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; auto const end_row = - std::min(num_rows, static_cast(start_row + NUM_STRING_ROWS_PER_BLOCK_TO_ROWS)); + std::min(num_rows, static_cast(start_row + NUM_STRING_ROWS_PER_BLOCK_TO_ROWS)); for (int row = start_row; row < end_row; row += warp.meta_group_size()) { - auto offset = fixed_width_row_size; // initial offset to variable-width data + auto offset = fixed_width_row_size; // initial offset to variable-width data auto const base_row_offset = row_offsets(row, 0); for (int col = 0; col < num_variable_columns; ++col) { auto const string_start_offset = variable_col_offsets[col][row]; - auto const string_length = variable_col_offsets[col][row + 1] - string_start_offset; + auto const string_length = variable_col_offsets[col][row + 1] - string_start_offset; if (warp.thread_rank() == 0) { // write the offset/length to column - uint32_t *output_dest = reinterpret_cast( - &output_data[base_row_offset + variable_col_output_offsets[col]]); + uint32_t* output_dest = reinterpret_cast( + &output_data[base_row_offset + variable_col_output_offsets[col]]); output_dest[0] = offset; output_dest[1] = string_length; } auto string_output_dest = &output_data[base_row_offset + offset]; - auto string_output_src = &variable_input_data[col][string_start_offset]; + auto string_output_src = &variable_input_data[col][string_start_offset]; warp.sync(); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, string_output_dest, string_output_src, string_length, block_barrier); @@ -893,11 +920,17 @@ __global__ void copy_strings_to_rows(size_type const num_rows, size_type const n * */ template -__global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, int8_t **output_data, - const size_type *col_sizes, const size_type *col_offsets, - device_span tile_infos, const int8_t *input_data) { +__global__ void copy_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type* col_sizes, + const size_type* col_offsets, + device_span tile_infos, + const int8_t* input_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -910,31 +943,30 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // memory for each of the tiles that we work on auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared[]; #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { - init(&tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED { - auto const fetch_tile = tile_infos[blockIdx.x]; + auto const fetch_tile = tile_infos[blockIdx.x]; auto const fetch_tile_start_row = fetch_tile.start_row; - auto const starting_col_offset = col_offsets[fetch_tile.start_col]; - auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); + auto const starting_col_offset = col_offsets[fetch_tile.start_col]; + auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); auto const row_batch_start = - fetch_tile.batch_number == 0 ? 0 : batch_row_boundaries[fetch_tile.batch_number]; + fetch_tile.batch_number == 0 ? 0 : batch_row_boundaries[fetch_tile.batch_number]; for (int absolute_row = warp.meta_group_rank() + fetch_tile.start_row; - absolute_row <= fetch_tile.end_row; absolute_row += warp.meta_group_size()) { + absolute_row <= fetch_tile.end_row; + absolute_row += warp.meta_group_size()) { warp.sync(); auto shared_offset = (absolute_row - fetch_tile_start_row) * fetch_tile_row_size; - auto dst = &shared[shared_offset]; + auto dst = &shared[shared_offset]; auto src = &input_data[row_offsets(absolute_row, row_batch_start) + starting_col_offset]; // copy the data #ifdef ASYNC_MEMCPY_SUPPORTED @@ -948,9 +980,9 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col } { - auto const tile = tile_infos[blockIdx.x]; - auto const rows_in_tile = tile.num_rows(); - auto const cols_in_tile = tile.num_cols(); + auto const tile = tile_infos[blockIdx.x]; + auto const rows_in_tile = tile.num_rows(); + auto const cols_in_tile = tile.num_cols(); auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); #ifdef ASYNC_MEMCPY_SUPPORTED @@ -958,7 +990,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED // Now we copy from shared memory to final destination. The data is laid out in rows in shared // memory, so the reads for a column will be "vertical". Because of this and the different sizes @@ -967,8 +999,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // than rows, we do a global index instead of a double for loop with col/row. for (int relative_row = warp.thread_rank(); relative_row < rows_in_tile; relative_row += warp.size()) { - - auto const absolute_row = relative_row + tile.start_row; + auto const absolute_row = relative_row + tile.start_row; auto const shared_memory_row_offset = tile_row_size * relative_row; for (int relative_col = warp.meta_group_rank(); relative_col < cols_in_tile; @@ -976,11 +1007,11 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col auto const absolute_col = relative_col + tile.start_col; auto const shared_memory_offset = - col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; + col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; auto const column_size = col_sizes[absolute_col]; - int8_t *shmem_src = &shared[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + int8_t* shmem_src = &shared[shared_memory_offset]; + int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; MEMCPY(dst, shmem_src, column_size, tile_barrier); } @@ -992,7 +1023,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1011,12 +1042,16 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col * */ template -__global__ void -copy_validity_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, bitmask_type **output_nm, - const size_type validity_offset, device_span tile_infos, - const int8_t *input_data) { +__global__ void copy_validity_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + bitmask_type** output_nm, + const size_type validity_offset, + device_span tile_infos, + const int8_t* input_data) +{ extern __shared__ int8_t shared[]; using cudf::detail::warp_size; @@ -1036,46 +1071,44 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, // __ballot_sync, representing 32 rows of that column. auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier shared_tile_barrier; - if (group.thread_rank() == 0) { - init(&shared_tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto const tile = tile_infos[blockIdx.x]; + auto const tile = tile_infos[blockIdx.x]; auto const tile_start_col = tile.start_col; auto const tile_start_row = tile.start_row; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); auto const threads_per_warp = warp.size(); - auto const cols_per_read = CHAR_BIT; + auto const cols_per_read = CHAR_BIT; - auto const rows_per_read = static_cast(threads_per_warp); - auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, cols_per_read); - auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; + auto const rows_per_read = static_cast(threads_per_warp); + auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, cols_per_read); + auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); + auto const validity_data_col_length = num_sections_y * 4; // words to bytes + auto const total_sections = num_sections_x * num_sections_y; // the tile is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections; my_section_idx += warp.meta_group_size()) { // convert section to row and col - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * cols_per_read; auto const relative_row = section_y * rows_per_read + warp.thread_rank(); auto const absolute_col = relative_col + tile_start_col; auto const absolute_row = relative_row + tile_start_row; auto const row_batch_start = - tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; + tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; - auto const participation_mask = __ballot_sync(0xFFFF'FFFFu, absolute_row < num_rows); + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); if (absolute_row < num_rows) { auto const my_byte = input_data[row_offsets(absolute_row, row_batch_start) + validity_offset + @@ -1090,8 +1123,8 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, // lead thread in each warp writes data if (warp.thread_rank() == 0) { auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / cols_per_read; - *reinterpret_cast(&shared[validity_write_offset]) = validity_data; + validity_data_col_length * (relative_col + i) + relative_row / cols_per_read; + *reinterpret_cast(&shared[validity_write_offset]) = validity_data; } } } @@ -1106,13 +1139,13 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; relative_col += warp.meta_group_size()) { auto const absolute_col = relative_col + tile_start_col; - auto dst = output_nm[absolute_col] + word_index(tile_start_row); + auto dst = output_nm[absolute_col] + word_index(tile_start_row); auto const src = - reinterpret_cast(&shared[validity_data_col_length * relative_col]); + reinterpret_cast(&shared[validity_data_col_length * relative_col]); #ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, aligned_size_t<4>(validity_data_col_length), - shared_tile_barrier); + cuda::memcpy_async( + warp, dst, src, aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); #else for (int b = warp.thread_rank(); b < col_words; b += warp.size()) { dst[b] = src[b]; @@ -1125,7 +1158,7 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, shared_tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1142,38 +1175,42 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, * @param num_string_columns number of string columns in the table */ template -__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, int32_t **string_row_offsets, - int32_t **string_lengths, size_type **string_column_offsets, - char **string_col_data, int8_t const *row_data, +__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, + int32_t** string_row_offsets, + int32_t** string_lengths, + size_type** string_column_offsets, + char** string_col_data, + int8_t const* row_data, size_type const num_rows, - size_type const num_string_columns) { + size_type const num_string_columns) +{ // Each warp takes a tile, which is a single column and up to ROWS_PER_BLOCK rows. A tile will not // wrap around the bottom of the table. The warp will copy the strings for each row in the tile. // Traversing in row-major order to coalesce the offsets and size reads. auto my_block = cooperative_groups::this_thread_block(); - auto warp = cooperative_groups::tiled_partition(my_block); + auto warp = cooperative_groups::tiled_partition(my_block); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; #endif // workaround for not being able to take a reference to a constexpr host variable auto const ROWS_PER_BLOCK = NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS; - auto const tiles_per_col = util::div_rounding_up_unsafe(num_rows, ROWS_PER_BLOCK); - auto const starting_tile = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); - auto const num_tiles = tiles_per_col * num_string_columns; - auto const tile_stride = warp.meta_group_size() * gridDim.x; + auto const tiles_per_col = util::div_rounding_up_unsafe(num_rows, ROWS_PER_BLOCK); + auto const starting_tile = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); + auto const num_tiles = tiles_per_col * num_string_columns; + auto const tile_stride = warp.meta_group_size() * gridDim.x; // Each warp will copy strings in its tile. This is handled by all the threads of a warp passing // the same parameters to async_memcpy and all threads in the warp participating in the copy. for (auto my_tile = starting_tile; my_tile < num_tiles; my_tile += tile_stride) { auto const starting_row = (my_tile % tiles_per_col) * ROWS_PER_BLOCK; - auto const col = my_tile / tiles_per_col; - auto const str_len = string_lengths[col]; - auto const str_row_off = string_row_offsets[col]; - auto const str_col_off = string_column_offsets[col]; - auto str_col_data = string_col_data[col]; + auto const col = my_tile / tiles_per_col; + auto const str_len = string_lengths[col]; + auto const str_row_off = string_row_offsets[col]; + auto const str_col_off = string_column_offsets[col]; + auto str_col_data = string_col_data[col]; for (int row = starting_row; row < starting_row + ROWS_PER_BLOCK && row < num_rows; ++row) { auto const src = &row_data[row_offsets(row, 0) + str_row_off[row]]; - auto dst = &str_col_data[str_col_off[row]]; + auto dst = &str_col_data[str_col_off[row]]; #ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, str_len[row], block_barrier); @@ -1196,8 +1233,12 @@ __global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, int32_t **s * @param [out] threads the size of the threads for the kernel * @return the size in bytes of shared memory needed for each block. */ -static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_type num_rows, - const size_type size_per_row, dim3 &blocks, dim3 &threads) { +static int calc_fixed_width_kernel_dims(const size_type num_columns, + const size_type num_rows, + const size_type size_per_row, + dim3& blocks, + dim3& threads) +{ // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. // We limit this to 32 threads in the y dimension so we can still @@ -1207,7 +1248,7 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); + int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); int const x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -1230,9 +1271,9 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ // to try and future proof this a bit. int const num_blocks = std::clamp((num_rows + block_size - 1) / block_size, 1, 10240); - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; threads.x = block_size; threads.y = y_block_size; threads.z = 1; @@ -1246,12 +1287,19 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ * into this function are common between runs and should be calculated once. */ static std::unique_ptr fixed_width_convert_to_rows( - const size_type start_row, const size_type num_rows, const size_type num_columns, - const size_type size_per_row, rmm::device_uvector &column_start, - rmm::device_uvector &column_size, rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, const scalar &zero, - const scalar &scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type size_per_row, + rmm::device_uvector& column_start, + rmm::device_uvector& column_size, + rmm::device_uvector& input_data, + rmm::device_uvector& input_nm, + const scalar& zero, + const scalar& scalar_size_per_row, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), @@ -1259,28 +1307,43 @@ static std::unique_ptr fixed_width_convert_to_rows( // Allocate and set the offsets row for the byte array std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr); + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr); - std::unique_ptr data = - make_numeric_column(data_type(type_id::INT8), static_cast(total_allocation), - mask_state::UNALLOCATED, stream, mr); + std::unique_ptr data = make_numeric_column(data_type(type_id::INT8), + static_cast(total_allocation), + mask_state::UNALLOCATED, + stream, + mr); dim3 blocks; dim3 threads; int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); copy_to_rows_fixed_width_optimized<<>>( - start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), - input_data.data(), input_nm.data(), data->mutable_view().data()); - - return make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, stream, mr}, stream, mr); + start_row, + num_rows, + num_columns, + size_per_row, + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), + data->mutable_view().data()); + + return make_lists_column(num_rows, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, cudf::get_default_stream(), mr}, + stream, + mr); } -static inline bool are_all_fixed_width(std::vector const &schema) { - return std::all_of(schema.begin(), schema.end(), - [](const data_type &t) { return is_fixed_width(t); }); +static inline bool are_all_fixed_width(std::vector const& schema) +{ + return std::all_of( + schema.begin(), schema.end(), [](const data_type& t) { return is_fixed_width(t); }); } /** @@ -1291,9 +1354,10 @@ static inline bool are_all_fixed_width(std::vector const &schema) { * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. */ -static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) { +static inline int32_t compute_fixed_width_layout(std::vector const& schema, + std::vector& column_start, + std::vector& column_size) +{ // We guarantee that the start of each column is 64-bit aligned so anything can go // there, but to make the code simple we will still do an alignment for it. int32_t at_offset = 0; @@ -1301,7 +1365,7 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s size_type s = size_of(*col); column_size.emplace_back(s); std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types at_offset = util::round_up_unsafe(at_offset, static_cast(alignment_needed)); column_start.emplace_back(at_offset); at_offset += allocation_needed; @@ -1311,7 +1375,7 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s // Eventually we can think about nullable vs not nullable, but for now we will just always add // it in int32_t const validity_bytes_needed = - util::div_rounding_up_safe(schema.size(), CHAR_BIT); + util::div_rounding_up_safe(schema.size(), CHAR_BIT); // validity comes at the end and is byte aligned so we can pack more in. at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1327,8 +1391,8 @@ struct column_info_s { std::vector column_sizes; std::vector variable_width_column_starts; - column_info_s &operator=(column_info_s const &other) = delete; - column_info_s &operator=(column_info_s &&other) = delete; + column_info_s& operator=(column_info_s const& other) = delete; + column_info_s& operator=(column_info_s&& other) = delete; }; /** @@ -1342,7 +1406,8 @@ struct column_info_s { * @return size of the fixed_width data portion of a row. */ template -column_info_s compute_column_information(iterator begin, iterator end) { +column_info_s compute_column_information(iterator begin, iterator end) +{ size_type size_per_row = 0; std::vector column_starts; std::vector column_sizes; @@ -1360,10 +1425,8 @@ column_info_s compute_column_information(iterator begin, iterator end) { // align size for this type - They are the same for fixed width types and 4 bytes for variable // width length/offset combos size_type const alignment_needed = compound_type ? __alignof(uint32_t) : col_size; - size_per_row = util::round_up_unsafe(size_per_row, alignment_needed); - if (compound_type) { - variable_width_column_starts.push_back(size_per_row); - } + size_per_row = util::round_up_unsafe(size_per_row, alignment_needed); + if (compound_type) { variable_width_column_starts.push_back(size_per_row); } column_starts.push_back(size_per_row); column_sizes.push_back(col_size); size_per_row += col_size; @@ -1375,9 +1438,11 @@ column_info_s compute_column_information(iterator begin, iterator end) { // validity is byte-aligned in the JCUDF format size_per_row += - util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); + util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); - return {size_per_row, std::move(column_starts), std::move(column_sizes), + return {size_per_row, + std::move(column_starts), + std::move(column_sizes), std::move(variable_width_column_starts)}; } @@ -1390,34 +1455,35 @@ column_info_s compute_column_information(iterator begin, iterator end) { * @param row_batches batched row information for multiple output locations * @return vector of `tile_info` structs for validity data */ -std::vector -build_validity_tile_infos(size_type const &num_columns, size_type const &num_rows, - size_type const &shmem_limit_per_tile, - std::vector const &row_batches) { +std::vector build_validity_tile_infos(size_type const& num_columns, + size_type const& num_rows, + size_type const& shmem_limit_per_tile, + std::vector const& row_batches) +{ auto const desired_rows_and_columns = static_cast(sqrt(shmem_limit_per_tile)); - auto const column_stride = util::round_up_unsafe( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, build a single tile for table width and ship it off - return num_columns; - } else { - return util::round_down_safe(desired_rows_and_columns, CHAR_BIT); - } - }(), - JCUDF_ROW_ALIGNMENT); + auto const column_stride = util::round_up_unsafe( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, build a single tile for table width and ship it off + return num_columns; + } else { + return util::round_down_safe(desired_rows_and_columns, CHAR_BIT); + } + }(), + JCUDF_ROW_ALIGNMENT); // we fit as much as we can given the column stride note that an element in the table takes just 1 // bit, but a row with a single element still takes 8 bytes! - auto const bytes_per_row = util::round_up_safe( - util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT); + auto const bytes_per_row = + util::round_up_safe(util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT); auto const row_stride = - std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); + std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); std::vector validity_tile_infos; validity_tile_infos.reserve(num_columns / column_stride * num_rows / row_stride); for (int col = 0; col < num_columns; col += column_stride) { int current_tile_row_batch = 0; - int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; - int row = 0; + int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; + int row = 0; while (row < num_rows) { if (rows_left_in_batch == 0) { current_tile_row_batch++; @@ -1425,8 +1491,11 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row } int const tile_height = std::min(row_stride, rows_left_in_batch); validity_tile_infos.emplace_back( - detail::tile_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - row + tile_height - 1, current_tile_row_batch}); + detail::tile_info{col, + row, + std::min(col + column_stride - 1, num_columns - 1), + row + tile_height - 1, + current_tile_row_batch}); row += tile_height; rows_left_in_batch -= tile_height; } @@ -1441,11 +1510,15 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row * * @tparam RowSize iterator that returns the size of a specific row */ -template struct row_size_functor { +template +struct row_size_functor { row_size_functor(size_type row_end, RowSize row_sizes, size_type last_row_end) - : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end) {} + : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end) + { + } - __device__ inline uint64_t operator()(int i) const { + __device__ inline uint64_t operator()(int i) const + { return i >= _row_end ? 0 : _row_sizes[i + _last_row_end]; } @@ -1467,11 +1540,15 @@ template struct row_size_functor { * device_uvector of row offsets */ template -batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_width, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +batch_data build_batches(size_type num_rows, + RowSize row_sizes, + bool all_fixed_width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); auto const num_batches = static_cast( - util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); + util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); auto const num_offsets = num_batches + 1; std::vector row_batches; std::vector batch_row_boundaries; @@ -1482,8 +1559,9 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w batch_row_boundaries.push_back(0); size_type last_row_end = 0; device_uvector cumulative_row_sizes(num_rows, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, - cumulative_row_sizes.begin()); + + thrust::inclusive_scan( + rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin()); // This needs to be split this into 2 gig batches. Care must be taken to avoid a batch larger than // 2 gigs. Imagine a table with 900 meg rows. The batches should occur every 2 rows, but if a @@ -1497,22 +1575,22 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w while (last_row_end < num_rows) { auto offset_row_sizes = thrust::make_transform_iterator( - cumulative_row_sizes.begin(), - cuda::proclaim_return_type( - [last_row_end, cumulative_row_sizes = cumulative_row_sizes.data()] __device__(auto i) { - return i - cumulative_row_sizes[last_row_end]; - })); + cumulative_row_sizes.begin(), + cuda::proclaim_return_type( + [last_row_end, cumulative_row_sizes = cumulative_row_sizes.data()] __device__(auto i) { + return i - cumulative_row_sizes[last_row_end]; + })); auto search_start = offset_row_sizes + last_row_end; - auto search_end = offset_row_sizes + num_rows; + auto search_end = offset_row_sizes + num_rows; // find the next MAX_BATCH_SIZE boundary auto const lb = - thrust::lower_bound(rmm::exec_policy(stream), search_start, search_end, MAX_BATCH_SIZE); + thrust::lower_bound(rmm::exec_policy(stream), search_start, search_end, MAX_BATCH_SIZE); size_type const batch_size = lb - search_start; - size_type const row_end = lb == search_end ? - batch_size + last_row_end : - last_row_end + util::round_down_safe(batch_size, 32); + size_type const row_end = lb == search_end + ? batch_size + last_row_end + : last_row_end + util::round_down_safe(batch_size, 32); // build offset list for each row in this batch auto const num_rows_in_batch = row_end - last_row_end; @@ -1522,10 +1600,12 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w device_uvector output_batch_row_offsets(num_entries, stream, mr); auto row_size_iter_bounded = cudf::detail::make_counting_transform_iterator( - 0, row_size_functor(row_end, row_sizes, last_row_end)); + 0, row_size_functor(row_end, row_sizes, last_row_end)); - thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, - row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); + thrust::exclusive_scan(rmm::exec_policy(stream), + row_size_iter_bounded, + row_size_iter_bounded + num_entries, + output_batch_row_offsets.begin()); auto const batch_bytes = output_batch_row_offsets.element(num_rows_in_batch, stream); @@ -1533,8 +1613,10 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w // needs to be individually allocated, but the kernel needs a contiguous array of offsets or // more global lookups are necessary. if (!all_fixed_width) { - cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), - num_rows_in_batch * sizeof(size_type), cudaMemcpyDefault); + cudaMemcpy(batch_row_offsets.data() + last_row_end, + output_batch_row_offsets.data(), + num_rows_in_batch * sizeof(size_type), + cudaMemcpyDeviceToDevice); } batch_row_boundaries.push_back(row_end); @@ -1543,10 +1625,11 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w last_row_end = row_end; } - return {std::move(batch_row_offsets), - make_device_uvector_async(batch_row_boundaries, stream, - rmm::mr::get_current_device_resource()), - std::move(batch_row_boundaries), std::move(row_batches)}; + return { + std::move(batch_row_offsets), + make_device_uvector_async(batch_row_boundaries, stream, rmm::mr::get_current_device_resource()), + std::move(batch_row_boundaries), + std::move(row_batches)}; } /** @@ -1557,20 +1640,25 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w * @param stream stream to use * @return number of tiles necessary */ -int compute_tile_counts(device_span const &batch_row_boundaries, - int desired_tile_height, rmm::cuda_stream_view stream) { +int compute_tile_counts(device_span const& batch_row_boundaries, + int desired_tile_height, + rmm::cuda_stream_view stream) +{ size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), - cuda::proclaim_return_type( - [desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__( - auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - - batch_row_boundaries[batch_index], - desired_tile_height); - })); + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_tiles.begin(), + cuda::proclaim_return_type( + [desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__( + auto batch_index) -> size_type { + return util::div_rounding_up_unsafe( + batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], + desired_tile_height); + })); return thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); } @@ -1586,64 +1674,78 @@ int compute_tile_counts(device_span const &batch_row_boundaries * @param stream stream to use * @return number of tiles created */ -size_type -build_tiles(device_span tiles, - device_uvector const &batch_row_boundaries, // comes from build_batches - int column_start, int column_end, int desired_tile_height, int total_number_of_rows, - rmm::cuda_stream_view stream) { +size_type build_tiles( + device_span tiles, + device_uvector const& batch_row_boundaries, // comes from build_batches + int column_start, + int column_end, + int desired_tile_height, + int total_number_of_rows, + rmm::cuda_stream_view stream) +{ size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), - cuda::proclaim_return_type( - [desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__( - auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - - batch_row_boundaries[batch_index], - desired_tile_height); - })); + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_tiles.begin(), + cuda::proclaim_return_type( + [desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__( + auto batch_index) -> size_type { + return util::div_rounding_up_unsafe( + batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], + desired_tile_height); + })); size_type const total_tiles = - thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); + thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); device_uvector tile_starts(num_batches + 1, stream); auto tile_iter = cudf::detail::make_counting_transform_iterator( - 0, cuda::proclaim_return_type( - [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { - return (i < num_batches) ? num_tiles[i] : 0; - })); - thrust::exclusive_scan(rmm::exec_policy(stream), tile_iter, tile_iter + num_batches + 1, - tile_starts.begin()); // in tiles + 0, + cuda::proclaim_return_type( + [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { + return (i < num_batches) ? num_tiles[i] : 0; + })); + thrust::exclusive_scan(rmm::exec_policy(stream), + tile_iter, + tile_iter + num_batches + 1, + tile_starts.begin()); // in tiles thrust::transform( - rmm::exec_policy(stream), iter, iter + total_tiles, tiles.begin(), - cuda::proclaim_return_type( - [=, tile_starts = tile_starts.data(), - batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { - // what batch this tile falls in - auto const batch_index_iter = thrust::upper_bound( - thrust::seq, tile_starts, tile_starts + num_batches, tile_index); - auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; - // local index within the tile - int const local_tile_index = tile_index - tile_starts[batch_index]; - // the start row for this batch. - int const batch_row_start = batch_row_boundaries[batch_index]; - // the start row for this tile - int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); - // the end row for this tile - int const max_row = - std::min(total_number_of_rows - 1, - batch_index + 1 > num_batches ? - std::numeric_limits::max() : - static_cast(batch_row_boundaries[batch_index + 1]) - 1); - int const tile_row_end = std::min( - batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); - - // stuff the tile - return tile_info{column_start, tile_row_start, column_end, tile_row_end, - static_cast(batch_index)}; - })); + rmm::exec_policy(stream), + iter, + iter + total_tiles, + tiles.begin(), + cuda::proclaim_return_type( + [ =, + tile_starts = tile_starts.data(), + batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { + // what batch this tile falls in + auto const batch_index_iter = + thrust::upper_bound(thrust::seq, tile_starts, tile_starts + num_batches, tile_index); + auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; + // local index within the tile + int const local_tile_index = tile_index - tile_starts[batch_index]; + // the start row for this batch. + int const batch_row_start = batch_row_boundaries[batch_index]; + // the start row for this tile + int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); + // the end row for this tile + int const max_row = + std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches + ? std::numeric_limits::max() + : static_cast(batch_row_boundaries[batch_index + 1]) - 1); + int const tile_row_end = + std::min(batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); + + // stuff the tile + return tile_info{ + column_start, tile_row_start, column_end, tile_row_end, static_cast(batch_index)}; + })); return total_tiles; } @@ -1661,13 +1763,16 @@ build_tiles(device_span tiles, * @param f callback function called when building a tile */ template -void determine_tiles(std::vector const &column_sizes, - std::vector const &column_starts, - size_type const first_row_batch_size, size_type const total_number_of_rows, - size_type const &shmem_limit_per_tile, TileCallback f) { +void determine_tiles(std::vector const& column_sizes, + std::vector const& column_starts, + size_type const first_row_batch_size, + size_type const total_number_of_rows, + size_type const& shmem_limit_per_tile, + TileCallback f) +{ // tile infos are organized with the tile going "down" the columns this provides the most // coalescing of memory access - int current_tile_width = 0; + int current_tile_width = 0; int current_tile_start_col = 0; // the ideal tile height has lots of 8-byte reads and 8-byte writes. The optimal read/write would @@ -1676,10 +1781,10 @@ void determine_tiles(std::vector const &column_sizes, // sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we want them // equal, so height and width are sqrt(shared_mem_size). The trick is that it's in bytes, not rows // or columns. - auto const square_bias = 32; // bias towards columns for performance reasons - auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); + auto const square_bias = 32; // bias towards columns for performance reasons + auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); auto const desired_tile_height = util::round_up_safe( - std::min(optimal_square_len / square_bias, total_number_of_rows), cudf::detail::warp_size); + std::min(optimal_square_len / square_bias, total_number_of_rows), cudf::detail::warp_size); auto const tile_height = std::clamp(desired_tile_height, 1, first_row_batch_size); int row_size = 0; @@ -1689,22 +1794,22 @@ void determine_tiles(std::vector const &column_sizes, auto const col_size = column_sizes[col]; // align size for this type - auto const alignment_needed = col_size; // They are the same for fixed width types - auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); + auto const alignment_needed = col_size; // They are the same for fixed width types + auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); auto const row_size_with_this_col = row_size_aligned + col_size; auto const row_size_with_end_pad = - util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); + util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); if (row_size_with_end_pad * tile_height > shmem_limit_per_tile) { // too large, close this tile, generate vertical tiles and restart f(current_tile_start_col, col == 0 ? col : col - 1, tile_height); row_size = - util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory tile boundary to match alignment - // of output row + util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory tile boundary to match + // alignment of output row current_tile_start_col = col; - current_tile_width = 0; + current_tile_width = 0; } else { row_size = row_size_with_this_col; current_tile_width++; @@ -1732,155 +1837,196 @@ void determine_tiles(std::vector const &column_sizes, */ template std::vector> convert_to_rows( - table_view const &tbl, batch_data &batch_info, offsetFunctor offset_functor, - column_info_s const &column_info, - std::optional> variable_width_offsets, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + table_view const& tbl, + batch_data& batch_info, + offsetFunctor offset_functor, + column_info_s const& column_info, + std::optional> variable_width_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#ifndef __CUDA_ARCH__ // __host__ code. +#ifndef __CUDA_ARCH__ // __host__ code. // Need to reduce total shmem available by the size of barriers in the kernel's shared memory total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ + util::round_up_unsafe(sizeof(cuda::barrier), 16ul); +#endif // __CUDA_ARCH__ auto const shmem_limit_per_tile = total_shmem_in_bytes; - auto const num_rows = tbl.num_rows(); + auto const num_rows = tbl.num_rows(); auto const fixed_width_only = !variable_width_offsets.has_value(); - auto select_columns = [](auto const &tbl, auto column_predicate) { + auto select_columns = [](auto const& tbl, auto column_predicate) { std::vector cols; - std::copy_if(tbl.begin(), tbl.end(), std::back_inserter(cols), - [&](auto c) { return column_predicate(c); }); + std::copy_if(tbl.begin(), tbl.end(), std::back_inserter(cols), [&](auto c) { + return column_predicate(c); + }); return table_view(cols); }; - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, - rmm::mr::get_current_device_resource()); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, - rmm::mr::get_current_device_resource()); + auto dev_col_sizes = make_device_uvector_async( + column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); + auto dev_col_starts = make_device_uvector_async( + column_info.column_starts, stream, rmm::mr::get_current_device_resource()); // Get the pointers to the input columnar data ready - auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { + auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { return is_compound(c.type()) ? nullptr : c.template data(); }); - std::vector input_data(data_begin, data_begin + tbl.num_columns()); + std::vector input_data(data_begin, data_begin + tbl.num_columns()); // validity code handles variable and fixed-width data, so give it everything auto const nm_begin = - thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { return c.null_mask(); }); - std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); + thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { return c.null_mask(); }); + std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); auto dev_input_data = - make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); auto dev_input_nm = - make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; std::vector output_buffers; - std::vector output_data; + std::vector output_data; output_data.reserve(batch_info.row_batches.size()); output_buffers.reserve(batch_info.row_batches.size()); - std::transform(batch_info.row_batches.begin(), batch_info.row_batches.end(), - std::back_inserter(output_buffers), [&](auto const &batch) { - return rmm::device_buffer(batch.num_bytes, stream, mr); - }); - std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), - [](auto &buf) { return static_cast(buf.data()); }); + std::transform( + batch_info.row_batches.begin(), + batch_info.row_batches.end(), + std::back_inserter(output_buffers), + [&](auto const& batch) { return rmm::device_buffer(batch.num_bytes, stream, mr); }); + std::transform( + output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), [](auto& buf) { + return static_cast(buf.data()); + }); auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, first_batch_size, num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, - &stream](int const start_col, int const end_col, int const tile_height) { - int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); - info_count += i; - }); + column_info.column_sizes, + column_info.column_starts, + first_batch_size, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const tile_height) { + int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); + info_count += i; + }); // allocate space for tiles device_uvector gpu_tile_infos(info_count, stream); int tile_offset = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, first_batch_size, num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &gpu_tile_infos, num_rows, - &tile_offset, stream](int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); - }); + column_info.column_sizes, + column_info.column_starts, + first_batch_size, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, + &gpu_tile_infos, + num_rows, + &tile_offset, + stream](int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, + start_col, + end_col, + tile_height, + num_rows, + stream); + }); // build validity tiles for ALL columns, variable and fixed width. auto validity_tile_infos = detail::build_validity_tile_infos( - tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); + tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, - rmm::mr::get_current_device_resource()); + auto dev_validity_tile_infos = + make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); auto const validity_offset = column_info.column_starts.back(); // blast through the entire table and convert it - detail::copy_to_rows<<>>( - num_rows, tbl.num_columns(), shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), offset_functor, - batch_info.d_batch_row_boundaries.data(), - reinterpret_cast(dev_output_data.data())); + detail::copy_to_rows<<>>(num_rows, + tbl.num_columns(), + shmem_limit_per_tile, + gpu_tile_infos, + dev_input_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + offset_functor, + batch_info.d_batch_row_boundaries.data(), + reinterpret_cast(dev_output_data.data())); // note that validity gets the entire table and not the fixed-width portion detail::copy_validity_to_rows<<>>( - num_rows, tbl.num_columns(), shmem_limit_per_tile, offset_functor, - batch_info.d_batch_row_boundaries.data(), dev_output_data.data(), validity_offset, - dev_validity_tile_infos, dev_input_nm.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + tbl.num_columns(), + shmem_limit_per_tile, + offset_functor, + batch_info.d_batch_row_boundaries.data(), + dev_output_data.data(), + validity_offset, + dev_validity_tile_infos, + dev_input_nm.data()); if (!fixed_width_only) { // build table view for variable-width data only auto const variable_width_table = - select_columns(tbl, [](auto col) { return is_compound(col.type()); }); + select_columns(tbl, [](auto col) { return is_compound(col.type()); }); CUDF_EXPECTS(!variable_width_table.is_empty(), "No variable-width columns when expected!"); CUDF_EXPECTS(variable_width_offsets.has_value(), "No variable width offset data!"); auto const variable_data_begin = - thrust::make_transform_iterator(variable_width_table.begin(), [](auto const &c) { - strings_column_view const scv{c}; - return is_compound(c.type()) ? scv.chars().template data() : nullptr; - }); - std::vector variable_width_input_data( - variable_data_begin, variable_data_begin + variable_width_table.num_columns()); + thrust::make_transform_iterator(variable_width_table.begin(), [](auto const& c) { + strings_column_view const scv{c}; + return is_compound(c.type()) ? scv.chars().template data() : nullptr; + }); + std::vector variable_width_input_data( + variable_data_begin, variable_data_begin + variable_width_table.num_columns()); auto dev_variable_input_data = make_device_uvector_async( - variable_width_input_data, stream, rmm::mr::get_current_device_resource()); + variable_width_input_data, stream, rmm::mr::get_current_device_resource()); auto dev_variable_col_output_offsets = make_device_uvector_async( - column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); + column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); for (uint i = 0; i < batch_info.row_batches.size(); i++) { auto const batch_row_offset = batch_info.batch_row_boundaries[i]; - auto const batch_num_rows = batch_info.row_batches[i].row_count; - - dim3 const string_blocks(std::min( - MAX_STRING_BLOCKS, - util::div_rounding_up_unsafe(batch_num_rows, NUM_STRING_ROWS_PER_BLOCK_TO_ROWS))); - - detail::copy_strings_to_rows<<>>( - batch_num_rows, variable_width_table.num_columns(), dev_variable_input_data.data(), - dev_variable_col_output_offsets.data(), variable_width_offsets->data(), - column_info.size_per_row, offset_functor, batch_row_offset, - reinterpret_cast(output_data[i])); + auto const batch_num_rows = batch_info.row_batches[i].row_count; + + dim3 const string_blocks( + std::min(MAX_STRING_BLOCKS, + util::div_rounding_up_unsafe(batch_num_rows, NUM_STRING_ROWS_PER_BLOCK_TO_ROWS))); + + detail::copy_strings_to_rows<<>>(batch_num_rows, + variable_width_table.num_columns(), + dev_variable_input_data.data(), + dev_variable_col_output_offsets.data(), + variable_width_offsets->data(), + column_info.size_per_row, + offset_functor, + batch_row_offset, + reinterpret_cast(output_data[i])); } } @@ -1889,26 +2035,36 @@ std::vector> convert_to_rows( std::vector> ret; ret.reserve(batch_info.row_batches.size()); auto counting_iter = thrust::make_counting_iterator(0); - std::transform(counting_iter, counting_iter + batch_info.row_batches.size(), - std::back_inserter(ret), [&](auto batch) { + std::transform(counting_iter, + counting_iter + batch_info.row_batches.size(), + std::back_inserter(ret), + [&](auto batch) { auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); auto offsets = - std::make_unique(data_type{type_id::INT32}, (size_type)offset_count, - batch_info.row_batches[batch].row_offsets.release(), - rmm::device_buffer{}, 0); - auto data = std::make_unique( - data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, - std::move(output_buffers[batch]), rmm::device_buffer{}, 0); - - return make_lists_column( - batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), - 0, rmm::device_buffer{0, cudf::get_default_stream(), mr}, stream, mr); + std::make_unique(data_type{type_id::INT32}, + (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release(), + rmm::device_buffer{}, + 0); + auto data = std::make_unique(data_type{type_id::INT8}, + batch_info.row_batches[batch].num_bytes, + std::move(output_buffers[batch]), + rmm::device_buffer{}, + 0); + + return make_lists_column(batch_info.row_batches[batch].row_count, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, cudf::get_default_stream(), mr}, + stream, + mr); }); return ret; } -} // namespace detail +} // namespace detail /** * @brief convert a cudf table to JCUDF row format @@ -1918,14 +2074,15 @@ std::vector> convert_to_rows( * @param mr memory resource used for returned data * @return vector of list columns containing byte columns of the JCUDF row data */ -std::vector> convert_to_rows(table_view const &tbl, +std::vector> convert_to_rows(table_view const& tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + rmm::mr::device_memory_resource* mr) +{ auto const num_columns = tbl.num_columns(); - auto const num_rows = tbl.num_rows(); + auto const num_rows = tbl.num_rows(); auto const fixed_width_only = std::all_of( - tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); + tbl.begin(), tbl.end(), [](column_view const& c) { return is_fixed_width(c.type()); }); // Break up the work into tiles, which are a starting and ending row/col #. This tile size is // calculated based on the shared memory size available we want a single tile to fill up the @@ -1941,94 +2098,107 @@ std::vector> convert_to_rows(table_view const &tbl, // before building the tiles so the tiles can be properly cut around them. auto schema_column_iter = - thrust::make_transform_iterator(tbl.begin(), [](auto const &i) { return i.type(); }); + thrust::make_transform_iterator(tbl.begin(), [](auto const& i) { return i.type(); }); auto column_info = - detail::compute_column_information(schema_column_iter, schema_column_iter + num_columns); + detail::compute_column_information(schema_column_iter, schema_column_iter + num_columns); auto const size_per_row = column_info.size_per_row; if (fixed_width_only) { // total encoded row size. This includes fixed-width data and validity only. It does not include // variable-width data since it isn't copied with the fixed-width and validity kernel. auto row_size_iter = thrust::make_constant_iterator( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); + util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); detail::fixed_width_row_offset_functor offset_functor( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); + util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); - return detail::convert_to_rows(tbl, batch_info, offset_functor, std::move(column_info), - std::nullopt, stream, mr); + return detail::convert_to_rows( + tbl, batch_info, offset_functor, std::move(column_info), std::nullopt, stream, mr); } else { auto offset_data = detail::build_string_row_offsets(tbl, size_per_row, stream); - auto &row_sizes = std::get<0>(offset_data); + auto& row_sizes = std::get<0>(offset_data); auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); + 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); detail::string_row_offset_functor offset_functor(batch_info.batch_row_offsets); - return detail::convert_to_rows(tbl, batch_info, offset_functor, std::move(column_info), - std::make_optional(std::move(std::get<1>(offset_data))), stream, + return detail::convert_to_rows(tbl, + batch_info, + offset_functor, + std::move(column_info), + std::make_optional(std::move(std::get<1>(offset_data))), + stream, mr); } } -std::vector> -convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::vector> convert_to_rows_fixed_width_optimized( + table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +{ auto const num_columns = tbl.num_columns(); std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), - [](auto i) -> data_type { return i.type(); }); + std::transform( + tbl.begin(), tbl.end(), schema.begin(), [](auto i) -> data_type { return i.type(); }); if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; int32_t const size_per_row = - detail::compute_fixed_width_layout(schema, column_start, column_size); + detail::compute_fixed_width_layout(schema, column_start, column_size); auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); // Make the number of rows per batch a multiple of 32 so we don't have to worry about splitting // validity at a specific row offset. This might change in the future. auto const max_rows_per_batch = - util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); + util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); auto const num_rows = tbl.num_rows(); // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { column_view cv = tbl.column(column_number); input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = scalar_type_t; - auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); + auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); + static_cast(zero.get())->set_value(0, stream); auto step = make_numeric_scalar(data_type(type_id::INT32), stream.value()); step->set_valid_async(true, stream); - static_cast(step.get())->set_value(static_cast(size_per_row), stream); + static_cast(step.get())->set_value(static_cast(size_per_row), stream); std::vector> ret; for (size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows( - row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, - dev_input_data, dev_input_nm, *zero, *step, stream, mr)); + row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, + row_count, + num_columns, + size_per_row, + dev_column_start, + dev_column_size, + dev_input_data, + dev_input_nm, + *zero, + *step, + stream, + mr)); } return ret; @@ -2037,6 +2207,19 @@ convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_vi } } +namespace { + +/// @brief Calculates and sets null counts for specified columns +void fixup_null_counts(std::vector>& output_columns, + rmm::cuda_stream_view stream) +{ + for (auto& col : output_columns) { + col->set_null_count(cudf::detail::null_count(col->view().null_mask(), 0, col->size(), stream)); + } +} + +} // namespace + /** * @brief convert from JCUDF row format to cudf columns * @@ -2046,12 +2229,13 @@ convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_vi * @param mr memory resource for returned data * @return cudf table of the data */ -std::unique_ptr convert_from_rows(lists_column_view const &input, - std::vector const &schema, +std::unique_ptr
convert_from_rows(lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + rmm::mr::device_memory_resource* mr) +{ // verify that the types are what we expect - column_view child = input.child(); + column_view child = input.child(); auto const list_type = child.type().id(); CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); @@ -2071,19 +2255,19 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, } auto const num_columns = string_schema.size(); - auto const num_rows = input.parent().size(); + auto const num_rows = input.parent().size(); int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#ifndef __CUDA_ARCH__ // __host__ code. +#ifndef __CUDA_ARCH__ // __host__ code. // Need to reduce total shmem available by the size of barriers in the kernel's shared memory total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ + util::round_up_unsafe(sizeof(cuda::barrier), 16ul); +#endif // __CUDA_ARCH__ auto const shmem_limit_per_tile = total_shmem_in_bytes; @@ -2093,41 +2277,44 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, // Ideally we would check that the offsets are all the same, etc. but for now this is probably // fine CUDF_EXPECTS(size_per_row * num_rows <= child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, - rmm::mr::get_current_device_resource()); - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, - rmm::mr::get_current_device_resource()); + auto dev_col_starts = make_device_uvector_async( + column_info.column_starts, stream, rmm::mr::get_current_device_resource()); + auto dev_col_sizes = make_device_uvector_async( + column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; std::vector> string_row_offset_columns; std::vector> string_length_columns; - std::vector output_data; - std::vector output_nm; - std::vector string_row_offsets; - std::vector string_lengths; + std::vector output_data; + std::vector output_nm; + std::vector string_row_offsets; + std::vector string_lengths; for (auto i : schema) { - auto make_col = [&output_data, &output_nm](data_type type, size_type num_rows, bool include_nm, + auto make_col = [&output_data, &output_nm](data_type type, + size_type num_rows, + bool include_nm, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - auto column = make_fixed_width_column( - type, num_rows, include_nm ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, - mr); + rmm::mr::device_memory_resource* mr) { + auto column = + make_fixed_width_column(type, + num_rows, + include_nm ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, + stream, + mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); - if (include_nm) { - output_nm.emplace_back(mut.null_mask()); - } + if (include_nm) { output_nm.emplace_back(mut.null_mask()); } return column; }; if (i.id() == type_id::STRING) { auto const int32type = data_type(type_id::INT32); auto offset_col = - make_col(int32type, num_rows, true, stream, rmm::mr::get_current_device_resource()); + make_col(int32type, num_rows, true, stream, rmm::mr::get_current_device_resource()); string_row_offsets.push_back(offset_col->mutable_view().data()); string_row_offset_columns.emplace_back(std::move(offset_col)); auto length_col = - make_col(int32type, num_rows, false, stream, rmm::mr::get_current_device_resource()); + make_col(int32type, num_rows, false, stream, rmm::mr::get_current_device_resource()); string_lengths.push_back(length_col->mutable_view().data()); string_length_columns.emplace_back(std::move(length_col)); // placeholder @@ -2138,160 +2325,214 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, } auto dev_string_row_offsets = - make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); auto dev_string_lengths = - make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); // build the row_batches from the passed in list column std::vector row_batches; row_batches.push_back( - {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); auto dev_output_data = - make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); auto dev_output_nm = - make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); // only ever get a single batch when going from rows, so boundaries are 0, num_rows constexpr auto num_batches = 2; device_uvector gpu_batch_row_boundaries(num_batches, stream); - thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_batches), gpu_batch_row_boundaries.begin(), + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_batches), + gpu_batch_row_boundaries.begin(), cuda::proclaim_return_type( - [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; })); + [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; })); int info_count = 0; - detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, num_rows, num_rows, shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &info_count, &stream](int const start_col, int const end_col, - int const tile_height) { - info_count += detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); - }); + detail::determine_tiles(column_info.column_sizes, + column_info.column_starts, + num_rows, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const tile_height) { + info_count += detail::compute_tile_counts( + gpu_batch_row_boundaries, tile_height, stream); + }); // allocate space for tiles device_uvector gpu_tile_infos(info_count, stream); int tile_offset = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, num_rows, num_rows, shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, - stream](int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); - }); + column_info.column_sizes, + column_info.column_starts, + num_rows, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, stream]( + int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, + start_col, + end_col, + tile_height, + num_rows, + stream); + }); dim3 const blocks(gpu_tile_infos.size()); // validity needs to be calculated based on the actual number of final table columns auto validity_tile_infos = - detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); + detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, - rmm::mr::get_current_device_resource()); + auto dev_validity_tile_infos = + make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); dim3 const validity_blocks(validity_tile_infos.size()); if (dev_string_row_offsets.size() == 0) { detail::fixed_width_row_offset_functor offset_functor(size_per_row); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_data.data(), dev_col_sizes.data(), - dev_col_starts.data(), gpu_tile_infos, child.data()); + detail::copy_from_rows<<>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + gpu_tile_infos, + child.data()); detail::copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_nm.data(), column_info.column_starts.back(), - dev_validity_tile_infos, child.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_nm.data(), + column_info.column_starts.back(), + dev_validity_tile_infos, + child.data()); } else { detail::string_row_offset_functor offset_functor(device_span{input.offsets()}); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_data.data(), dev_col_sizes.data(), - dev_col_starts.data(), gpu_tile_infos, child.data()); + detail::copy_from_rows<<>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + gpu_tile_infos, + child.data()); detail::copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_nm.data(), column_info.column_starts.back(), - dev_validity_tile_infos, child.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_nm.data(), + column_info.column_starts.back(), + dev_validity_tile_infos, + child.data()); std::vector> string_col_offsets; std::vector> string_data_cols; - std::vector string_col_offset_ptrs; - std::vector string_data_col_ptrs; - for (auto &col_string_lengths : string_lengths) { + std::vector string_col_offset_ptrs; + std::vector string_data_col_ptrs; + for (auto& col_string_lengths : string_lengths) { device_uvector output_string_offsets(num_rows + 1, stream, mr); - auto tmp = cuda::proclaim_return_type( - [num_rows, col_string_lengths] __device__(auto const &i) { - return i < num_rows ? col_string_lengths[i] : 0; - }); + auto tmp = cuda::proclaim_return_type( + [num_rows, col_string_lengths] __device__(auto const& i) { + return i < num_rows ? col_string_lengths[i] : 0; + }); auto bounded_iter = cudf::detail::make_counting_transform_iterator(0, tmp); - thrust::exclusive_scan(rmm::exec_policy(stream), bounded_iter, bounded_iter + num_rows + 1, + thrust::exclusive_scan(rmm::exec_policy(stream), + bounded_iter, + bounded_iter + num_rows + 1, output_string_offsets.begin()); // allocate destination string column - rmm::device_uvector string_data(output_string_offsets.element(num_rows, stream), stream, - mr); + rmm::device_uvector string_data( + output_string_offsets.element(num_rows, stream), stream, mr); string_col_offset_ptrs.push_back(output_string_offsets.data()); string_data_col_ptrs.push_back(string_data.data()); string_col_offsets.push_back(std::move(output_string_offsets)); string_data_cols.push_back(std::move(string_data)); } - auto dev_string_col_offsets = make_device_uvector_async(string_col_offset_ptrs, stream, - rmm::mr::get_current_device_resource()); - auto dev_string_data_cols = make_device_uvector_async(string_data_col_ptrs, stream, - rmm::mr::get_current_device_resource()); + auto dev_string_col_offsets = make_device_uvector_async( + string_col_offset_ptrs, stream, rmm::mr::get_current_device_resource()); + auto dev_string_data_cols = make_device_uvector_async( + string_data_col_ptrs, stream, rmm::mr::get_current_device_resource()); dim3 const string_blocks( - std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), - MAX_STRING_BLOCKS)); + std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), + MAX_STRING_BLOCKS)); - detail::copy_strings_from_rows<<>>( - offset_functor, dev_string_row_offsets.data(), dev_string_lengths.data(), - dev_string_col_offsets.data(), dev_string_data_cols.data(), child.data(), num_rows, - static_cast(string_col_offsets.size())); + offset_functor, + dev_string_row_offsets.data(), + dev_string_lengths.data(), + dev_string_col_offsets.data(), + dev_string_data_cols.data(), + child.data(), + num_rows, + static_cast(string_col_offsets.size())); // merge strings back into output_columns int string_idx = 0; for (int i = 0; i < static_cast(schema.size()); ++i) { if (schema[i].id() == type_id::STRING) { // stuff real string column - auto const null_count = string_row_offset_columns[string_idx]->null_count(); - auto string_data = string_row_offset_columns[string_idx].release()->release(); - output_columns[i] = make_strings_column( - num_rows, - std::make_unique(std::move(string_col_offsets[string_idx]), - rmm::device_buffer{}, 0), - std::make_unique(std::move(string_data_cols[string_idx]), - rmm::device_buffer{}, 0), - null_count, std::move(*string_data.null_mask.release())); + auto string_data = string_row_offset_columns[string_idx].release()->release(); + output_columns[i] = make_strings_column(num_rows, + std::move(string_col_offsets[string_idx]), + std::move(string_data_cols[string_idx]), + std::move(*string_data.null_mask.release()), + 0); + // Null count set to 0, temporarily. Will be fixed up before return. string_idx++; } } } - for (auto &col : output_columns) { - col->set_null_count(cudf::null_count(col->view().null_mask(), 0, col->size())); - } + // Set null counts, because output_columns are modified via mutable-view, + // in the kernel above. + // TODO(future): Consider setting null count in the kernel itself. + fixup_null_counts(output_columns, stream); + return std::make_unique
(std::move(output_columns)); } -std::unique_ptr
convert_from_rows_fixed_width_optimized( - lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +std::unique_ptr
convert_from_rows_fixed_width_optimized(lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ // verify that the types are what we expect - column_view child = input.child(); + column_view child = input.child(); auto const list_type = child.type().id(); CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); @@ -2302,7 +2543,7 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( std::vector column_start; std::vector column_size; - auto const num_rows = input.parent().size(); + auto const num_rows = input.parent().size(); auto const size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now this is probably @@ -2310,17 +2551,17 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_column_start = - make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); auto dev_column_size = - make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (int i = 0; i < static_cast(num_columns); i++) { auto column = - make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); + make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); output_nm.emplace_back(mut.null_mask()); @@ -2328,26 +2569,32 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); detail::copy_from_rows_fixed_width_optimized<<>>( - num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), - dev_output_data.data(), dev_output_nm.data(), child.data()); + num_rows, + num_columns, + size_per_row, + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), + child.data()); + + // Set null counts, because output_columns are modified via mutable-view, + // in the kernel above. + // TODO(future): Consider setting null count in the kernel itself. + fixup_null_counts(output_columns, stream); - for (auto &col : output_columns) { - col->set_null_count(cudf::null_count(col->view().null_mask(), 0, col->size())); - } return std::make_unique
(std::move(output_columns)); } else { CUDF_FAIL("Only fixed width types are currently supported"); } } -} // namespace jni - -} // namespace cudf +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index af4ab8c2485..40d745338f4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -257,6 +257,8 @@ ConfigureTest( transform/one_hot_encode_tests.cpp ) +ConfigureTest(ROW_CONVERSION_TEST transform/row_conversion.cpp) + # ################################################################################################## # * interop tests ------------------------------------------------------------------------- ConfigureTest( diff --git a/cpp/tests/transform/row_conversion.cpp b/cpp/tests/transform/row_conversion.cpp new file mode 100644 index 00000000000..6d8f714401c --- /dev/null +++ b/cpp/tests/transform/row_conversion.cpp @@ -0,0 +1,1021 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +struct ColumnToRowTests : public cudf::test::BaseFixture {}; +struct RowToColumnTests : public cudf::test::BaseFixture {}; + +TEST_F(ColumnToRowTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, SimpleString) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1, 0, -1}); + cudf::test::strings_column_wrapper b( + {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); + cudf::table_view in(std::vector{a, b}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; + + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(new_rows[0]->size(), 5); +} + +TEST_F(ColumnToRowTests, DoubleString) +{ + cudf::test::strings_column_wrapper a( + {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); + cudf::test::fixed_width_column_wrapper b({0, 1, 2, 3, 4}); + cudf::test::strings_column_wrapper c({"world", + "hello", + "this string isn't as long", + "this one isn't so short though when you think about it", + "dlrow"}); + cudf::table_view in(std::vector{a, b, c}); + + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(new_rows[0]->size(), 5); +} + +TEST_F(ColumnToRowTests, BigStrings) +{ + char const* TEST_STRINGS[] = { + "These", + "are", + "the", + "test", + "strings", + "that", + "we", + "have", + "some are really long", + "and some are kinda short", + "They are all over on purpose with different sizes for the strings in order to test the code " + "on all different lengths of strings", + "a", + "good test", + "is required to produce reasonable confidence that this is working"}; + auto num_generator = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + auto string_generator = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { + return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; + }); + + auto const num_rows = 50; + auto const num_cols = 50; + std::vector schema; + + std::vector cols; + std::vector views; + + for (auto col = 0; col < num_cols; ++col) { + if (rand() % 2) { + cols.emplace_back( + cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); + } else { + cols.emplace_back( + cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::type_id::STRING); + } + } + + cudf::table_view in(views); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(new_rows[0]->size(), num_rows); +} + +TEST_F(ColumnToRowTests, ManyStrings) +{ + char const* TEST_STRINGS[] = { + "These", + "are", + "the", + "test", + "strings", + "that", + "we", + "have", + "some are really long", + "and some are kinda short", + "They are all over on purpose with different sizes for the strings in order to test the code " + "on all different lengths of strings", + "a", + "good test", + "is required to produce reasonable confidence that this is working", + "some strings", + "are split into multiple strings", + "some strings have all their data", + "lots of choices of strings and sizes is sure to test the offset calculation code to ensure " + "that even a really long string ends up in the correct spot for the final destination allowing " + "for even crazy run-on sentences to be inserted into the data"}; + auto num_generator = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + auto string_generator = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { + return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; + }); + + auto const num_rows = 1000000; + auto const num_cols = 50; + std::vector schema; + + std::vector cols; + std::vector views; + + for (auto col = 0; col < num_cols; ++col) { + if (rand() % 2) { + cols.emplace_back( + cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); + } else { + cols.emplace_back( + cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::type_id::STRING); + } + } + + cudf::table_view in(views); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(new_rows[0]->size(), num_rows); +} + +TEST_F(ColumnToRowTests, Simple) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Tall) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Wide) +{ + std::vector> cols; + std::vector views; + std::vector schema; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, SingleByteWide) +{ + std::vector> cols; + std::vector views; + std::vector schema; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + + schema.push_back(cudf::data_type{cudf::type_id::INT8}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Non2Power) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Bigger) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(ColumnToRowTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 2 million rows + constexpr auto num_rows = 2 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Simple) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Tall) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Wide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, SingleByteWide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, AllTypes) +{ + std::vector> cols; + std::vector views; + std::vector schema{cudf::data_type{cudf::type_id::INT64}, + cudf::data_type{cudf::type_id::FLOAT64}, + cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::BOOL8}, + cudf::data_type{cudf::type_id::FLOAT32}, + cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::INT64}}; + + cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_point_column_wrapper c6( + {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); + cudf::test::fixed_point_column_wrapper c7( + {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); + + cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, AllTypesLarge) +{ + std::vector cols; + std::vector schema{}; + + // 15 columns of each type with 1 million entries + constexpr int num_rows{1024 * 1024 * 1}; + + std::default_random_engine re; + std::uniform_real_distribution rand_double(std::numeric_limits::min(), + std::numeric_limits::max()); + std::uniform_int_distribution rand_int64(std::numeric_limits::min(), + std::numeric_limits::max()); + auto r = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> int64_t { return rand_int64(re); }); + auto d = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> double { return rand_double(re); }); + + auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); + auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); + auto most_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); + auto few_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 13 == 0 ? 1 : 0; }); + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::INT8}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::INT16}); + } + + for (int i = 0; i < 15; ++i) { + if (i < 5) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); + } else { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) + .release() + .release()); + } + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows, all_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows, most_valid) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, all_valid, numeric::scale_type{-2}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); + } + + for (int i = 0; i < 15; ++i) { + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, most_valid, numeric::scale_type{-1}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); + } + + std::vector views(cols.begin(), cols.end()); + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Non2Power) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Bigger) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 1 million rows + constexpr auto num_rows = 5 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, SimpleString) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1, 0, -1}); + cudf::test::strings_column_wrapper b( + {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); + cudf::table_view in(std::vector{a, b}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::STRING}}; + + auto new_rows = cudf::convert_to_rows(in); + EXPECT_EQ(new_rows.size(), 1); + for (auto& row : new_rows) { + auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*row), schema); + + EXPECT_EQ(row->size(), 5); + auto const num_columns = new_cols->num_columns(); + + cudf::strings_column_view str_col = new_cols->get_column(1).view(); + std::vector> col_data; + std::vector> offset_data; + for (int i = 0; i < num_columns; ++i) { + offset_data.emplace_back( + std::get<0>(cudf::test::to_host(str_col.offsets()))); + col_data.emplace_back(std::get<0>(cudf::test::to_host(str_col.chars()))); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); + } +} + +TEST_F(RowToColumnTests, DoubleString) +{ + cudf::test::strings_column_wrapper a( + {"hello", "world", "this is a really long string to generate a longer row", "dlrow", "olleh"}); + cudf::test::fixed_width_column_wrapper b({0, 1, 2, 3, 4}); + cudf::test::strings_column_wrapper c({"world", + "hello", + "this string isn't as long", + "this one isn't so short though when you think about it", + "dlrow"}); + cudf::table_view in(std::vector{a, b, c}); + std::vector schema = {cudf::data_type{cudf::type_id::STRING}, + cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::STRING}}; + + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < new_rows.size(); ++i) { + auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + EXPECT_EQ(new_rows[0]->size(), 5); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); + } +} + +TEST_F(RowToColumnTests, BigStrings) +{ + char const* TEST_STRINGS[] = { + "These", + "are", + "the", + "test", + "strings", + "that", + "we", + "have", + "some are really long", + "and some are kinda short", + "They are all over on purpose with different sizes for the strings in order to test the code " + "on all different lengths of strings", + "a", + "good test", + "is required to produce reasonable confidence that this is working"}; + auto num_generator = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + auto string_generator = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { + return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; + }); + + auto const num_rows = 50; + auto const num_cols = 50; + std::vector schema; + + std::vector cols; + std::vector views; + + for (auto col = 0; col < num_cols; ++col) { + if (rand() % 2) { + cols.emplace_back( + cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); + } else { + cols.emplace_back( + cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::type_id::STRING); + } + } + + cudf::table_view in(views); + auto new_rows = cudf::convert_to_rows(in); + + for (auto& i : new_rows) { + auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*i), schema); + + auto in_view = cudf::slice(in, {0, new_cols->num_rows()}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in_view[0], *new_cols); + } +} + +TEST_F(RowToColumnTests, ManyStrings) +{ + char const* TEST_STRINGS[] = { + "These", + "are", + "the", + "test", + "strings", + "that", + "we", + "have", + "some are really long", + "and some are kinda short", + "They are all over on purpose with different sizes for the strings in order to test the code " + "on all different lengths of strings", + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "this string is the longest string because it is duplicated more than you can imagine " + "a", + "good test", + "is required to produce reasonable confidence that this is working", + "some strings", + "are split into multiple strings", + "some strings have all their data", + "lots of choices of strings and sizes is sure to test the offset calculation code to ensure " + "that even a really long string ends up in the correct spot for the final destination allowing " + "for even crazy run-on sentences to be inserted into the data"}; + auto num_generator = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + auto string_generator = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) -> char const* { + return TEST_STRINGS[rand() % (sizeof(TEST_STRINGS) / sizeof(TEST_STRINGS[0]))]; + }); + + auto const num_rows = 500000; + auto const num_cols = 50; + std::vector schema; + + std::vector cols; + std::vector views; + + for (auto col = 0; col < num_cols; ++col) { + if (rand() % 2) { + cols.emplace_back( + cudf::test::fixed_width_column_wrapper(num_generator, num_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::data_type{cudf::type_id::INT32}); + } else { + cols.emplace_back( + cudf::test::strings_column_wrapper(string_generator, string_generator + num_rows)); + views.push_back(cols.back()); + schema.emplace_back(cudf::type_id::STRING); + } + } + + cudf::table_view in(views); + auto new_rows = cudf::convert_to_rows(in); + + for (auto& i : new_rows) { + auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*i), schema); + + auto in_view = cudf::slice(in, {0, new_cols->num_rows()}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in_view[0], *new_cols); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 3bd1e3f25a7..50ea54ddaab 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -751,14 +751,6 @@ private static native long[] scatterScalars(long[] srcScalarHandles, long scatte long targetTableHandle) throws CudfException; - private static native long[] convertToRows(long nativeHandle); - - private static native long[] convertToRowsFixedWidthOptimized(long nativeHandle); - - private static native long[] convertFromRows(long nativeColumnView, int[] types, int[] scale); - - private static native long[] convertFromRowsFixedWidthOptimized(long nativeColumnView, int[] types, int[] scale); - private static native long[] repeatStaticCount(long tableHandle, int count); private static native long[] repeatColumnCount(long tableHandle, @@ -3647,140 +3639,6 @@ public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKe return buildSemiJoinGatherMap(gatherMapData); } - /** - * For details about how this method functions refer to - * {@link #convertToRowsFixedWidthOptimized()}. - * - * The only thing different between this method and {@link #convertToRowsFixedWidthOptimized()} - * is that this can handle roughly 250M columns while {@link #convertToRowsFixedWidthOptimized()} - * can only handle columns less than 100 - */ - public ColumnVector[] convertToRows() { - long[] ptrs = convertToRows(nativeHandle); - return ColumnVector.getColumnVectorsFromPointers(ptrs); - } - - /** - * Convert this table of columns into a row major format that is useful for interacting with other - * systems that do row major processing of the data. Currently only fixed-width column types are - * supported. - *

- * The output is one or more ColumnVectors that are lists of bytes. A ColumnVector that is a - * list of bytes can have at most 2GB of data stored in it. Multiple ColumnVectors are returned - * if not all of the data can fit in a single one. - *

- * Each row in the returned ColumnVector array corresponds to a row in the input table. The rows - * will be in the same order as the input Table. The first ColumnVector in the array will hold - * the first N rows followed by the second ColumnVector and so on. The following illustrates - * this and also shows some of the internal structure that will be explained later. - *

-   * result[0]:
-   *  | row 0 | validity for row 0 | padding |
-   *  ...
-   *  | row N | validity for row N | padding |
-   *  result[1]:
-   *  |row N+1 | validity for row N+1 | padding |
-   *  ...
-   * 
- * - * The format of each row is similar in layout to a C struct where each column will have padding - * in front of it to align it properly. Each row has padding inserted at the end so the next row - * is aligned to a 64-bit boundary. This is so that the first column will always start at the - * beginning (first byte) of the list of bytes and each row has a consistent layout for fixed - * width types. - *

- * Validity bytes are added to the end of the row. There will be one byte for each 8 columns in a - * row. Because the validity is byte aligned there is no padding between it and the last column - * in the row. - *

- * For example a table consisting of the following columns A, B, C with the corresponding types - *

-   *   | A - BOOL8 (8-bit) | B - INT16 (16-bit) | C - DURATION_DAYS (32-bit) |
-   * 
- *

- * Will have a layout that looks like - *

-   *  | A_0 | P | B_0 | B_1 | C_0 | C_1 | C_2 | C_3 | V0 | P | P | P | P | P | P | P |
-   * 
- *

- * In this P corresponds to a byte of padding, [LETTER]_[NUMBER] represents the NUMBER - * byte of the corresponding LETTER column, and V[NUMBER] is a validity byte for the `NUMBER * 8` - * to `(NUMBER + 1) * 8` columns. - *

- * The order of the columns will not be changed, but to reduce the total amount of padding it is - * recommended to order the columns in the following way. - *

- *

    - *
  1. 64-bit columns
  2. - *
  3. 32-bit columns
  4. - *
  5. 16-bit columns
  6. - *
  7. 8-bit columns
  8. - *
- *

- * This way padding is only inserted at the end of a row to make the next column 64-bit aligned. - * So for the example above if the columns were ordered C, B, A the layout would be. - *

-   * | C_0 | C_1 | C_2 | C_3 | B_0 | B_1 | A_0 | V0 |
-   * 
- * This would have reduced the overall size of the data transferred by half. - *

- * One of the main motivations for doing a row conversion on the GPU is to avoid cache problems - * when walking through columnar data on the CPU in a row wise manner. If you are not transferring - * very many columns it is likely to be more efficient to just pull back the columns and walk - * through them. This is especially true of a single column of fixed width data. The extra - * padding will slow down the transfer and looking at only a handful of buffers is not likely to - * cause cache issues. - *

- * There are some limits on the size of a single row. If the row is larger than 1KB this will - * throw an exception. - */ - public ColumnVector[] convertToRowsFixedWidthOptimized() { - long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); - return ColumnVector.getColumnVectorsFromPointers(ptrs); - } - - /** - * Convert a column of list of bytes that is formatted like the output from `convertToRows` - * and convert it back to a table. - * - * NOTE: This method doesn't support nested types - * - * @param vec the row data to process. - * @param schema the types of each column. - * @return the parsed table. - */ - public static Table convertFromRows(ColumnView vec, DType ... schema) { - int[] types = new int[schema.length]; - int[] scale = new int[schema.length]; - for (int i = 0; i < schema.length; i++) { - types[i] = schema[i].typeId.nativeId; - scale[i] = schema[i].getScale(); - - } - return new Table(convertFromRows(vec.getNativeView(), types, scale)); - } - - /** - * Convert a column of list of bytes that is formatted like the output from `convertToRows` - * and convert it back to a table. - * - * NOTE: This method doesn't support nested types - * - * @param vec the row data to process. - * @param schema the types of each column. - * @return the parsed table. - */ - public static Table convertFromRowsFixedWidthOptimized(ColumnView vec, DType ... schema) { - int[] types = new int[schema.length]; - int[] scale = new int[schema.length]; - for (int i = 0; i < schema.length; i++) { - types[i] = schema[i].typeId.nativeId; - scale[i] = schema[i].getScale(); - - } - return new Table(convertFromRowsFixedWidthOptimized(vec.getNativeView(), types, scale)); - } - /** * Construct a table from a packed representation. * @param metadata host-based metadata for the table diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 260b402443a..1853cbd6a5c 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -147,7 +147,6 @@ add_library( src/TableJni.cpp src/aggregation128_utils.cu src/maps_column_view.cu - src/row_conversion.cu src/check_nvcomp_output_sizes.cu ) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index fad19bdf895..d7d0279174d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -54,7 +54,6 @@ #include "jni_compiled_expr.hpp" #include "jni_utils.hpp" #include "jni_writer_data_sink.hpp" -#include "row_conversion.hpp" namespace cudf { namespace jni { @@ -3207,24 +3206,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL -Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, jlong input_table) { - JNI_NULL_CHECK(env, input_table, "input table is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const n_input_table = reinterpret_cast(input_table); - std::vector> cols = - cudf::jni::convert_to_rows_fixed_width_optimized(*n_input_table); - int num_columns = cols.size(); - cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::transform(cols.begin(), cols.end(), outcol_handles.begin(), - [](auto &col) { return release_as_jlong(col); }); - return outcol_handles.get_jArray(); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterTable(JNIEnv *env, jclass, jlong j_input, jlong j_map, jlong j_target) { @@ -3260,72 +3241,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterScalars(JNIEnv *en CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env, jclass, - jlong input_table) { - JNI_NULL_CHECK(env, input_table, "input table is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::jni::convert_to_rows(*n_input_table); - int num_columns = cols.size(); - cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::transform(cols.begin(), cols.end(), outcol_handles.begin(), - [](auto &col) { return release_as_jlong(col); }); - return outcol_handles.get_jArray(); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized( - JNIEnv *env, jclass, jlong input_column, jintArray types, jintArray scale) { - JNI_NULL_CHECK(env, input_column, "input column is null", 0); - JNI_NULL_CHECK(env, types, "types is null", 0); - - try { - cudf::jni::auto_set_device(env); - cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; - cudf::jni::native_jintArray n_types(env, types); - cudf::jni::native_jintArray n_scale(env, scale); - if (n_types.size() != n_scale.size()) { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", - NULL); - } - std::vector types_vec; - std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), - [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); - std::unique_ptr result = - cudf::jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); - return convert_table_for_return(env, result); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *env, jclass, - jlong input_column, - jintArray types, - jintArray scale) { - JNI_NULL_CHECK(env, input_column, "input column is null", 0); - JNI_NULL_CHECK(env, types, "types is null", 0); - - try { - cudf::jni::auto_set_device(env); - cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; - cudf::jni::native_jintArray n_types(env, types); - cudf::jni::native_jintArray n_scale(env, scale); - if (n_types.size() != n_scale.size()) { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", - NULL); - } - std::vector types_vec; - std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), - [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); - std::unique_ptr result = cudf::jni::convert_from_rows(list_input, types_vec); - return convert_table_for_return(env, result); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatStaticCount(JNIEnv *env, jclass, jlong input_jtable, jint count) { diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp deleted file mode 100644 index e4631875152..00000000000 --- a/java/src/main/native/src/row_conversion.hpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -namespace cudf { -namespace jni { - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const &tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::vector> -convert_to_rows(cudf::table_view const &tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr -convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -} // namespace jni -} // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index b0dd4122b0e..8df8ebea8a7 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -9076,73 +9076,6 @@ void testStructColumnFilterStrings() { } } - @Test - void fixedWidthRowsRoundTripWide() { - TestBuilder tb = new TestBuilder(); - IntStream.range(0, 10).forEach(i -> tb.column(3l, 9l, 4l, 2l, 20l, null)); - IntStream.range(0, 10).forEach(i -> tb.column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null)); - IntStream.range(0, 10).forEach(i -> tb.column(5, 1, 0, 2, 7, null)); - IntStream.range(0, 10).forEach(i -> tb.column(true, false, false, true, false, null)); - IntStream.range(0, 10).forEach(i -> tb.column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f, null)); - IntStream.range(0, 10).forEach(i -> tb.column(new Byte[]{2, 3, 4, 5, 9, null})); - IntStream.range(0, 10).forEach(i -> tb.decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, - 9.5d, 0.9d, 7.23d, 2.8d, null)); - IntStream.range(0, 10).forEach(i -> tb.decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null)); - try (Table origTable = tb.build()) { - ColumnVector[] rowMajorTable = origTable.convertToRows(); - try { - // We didn't overflow - assert rowMajorTable.length == 1; - ColumnVector cv = rowMajorTable[0]; - assert cv.getRowCount() == origTable.getRowCount(); - DType[] types = new DType[origTable.getNumberOfColumns()]; - for (int i = 0; i < origTable.getNumberOfColumns(); i++) { - types[i] = origTable.getColumn(i).getType(); - } - try (Table backAgain = Table.convertFromRows(cv, types)) { - assertTablesAreEqual(origTable, backAgain); - } - } finally { - for (ColumnVector cv : rowMajorTable) { - cv.close(); - } - } - } - } - - @Test - void fixedWidthRowsRoundTrip() { - try (Table origTable = new TestBuilder() - .column(3l, 9l, 4l, 2l, 20l, null) - .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) - .column(5, 1, 0, 2, 7, null) - .column(true, false, false, true, false, null) - .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f, null) - .column(new Byte[]{2, 3, 4, 5, 9, null}) - .decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) - .decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null) - .build()) { - ColumnVector[] rowMajorTable = origTable.convertToRowsFixedWidthOptimized(); - try { - // We didn't overflow - assert rowMajorTable.length == 1; - ColumnVector cv = rowMajorTable[0]; - assert cv.getRowCount() == origTable.getRowCount(); - DType[] types = new DType[origTable.getNumberOfColumns()]; - for (int i = 0; i < origTable.getNumberOfColumns(); i++) { - types[i] = origTable.getColumn(i).getType(); - } - try (Table backAgain = Table.convertFromRowsFixedWidthOptimized(cv, types)) { - assertTablesAreEqual(origTable, backAgain); - } - } finally { - for (ColumnVector cv : rowMajorTable) { - cv.close(); - } - } - } - } - // utility methods to reduce typing private static StructData struct(Object... values) {