diff --git a/CHANGELOG.md b/CHANGELOG.md index b650663d64e..f3b69963218 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -72,6 +72,7 @@ - PR #6597 Use thread-local to track CUDA device in JNI - PR #6599 Replace `size()==0` with `empty()`, `is_empty()` - PR #6514 Initial work for decimal type in Java/JNI +- PR #6605 Reduce HtoD copies in `cudf::concatenate` of string columns - PR #6608 Improve subword tokenizer docs - PR #6610 Add ability to set scalar values in `cudf.DataFrame` - PR #6612 Update JNI to new RMM cuda_stream_view API diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index d913a414ad7..69517aa8d27 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -15,7 +15,9 @@ */ #pragma once +#include #include +#include #include #include #include @@ -875,5 +877,60 @@ struct mutable_value_accessor { __device__ T& operator()(cudf::size_type i) { return col.element(i); } }; +/** + * @brief Helper function for use by column_device_view and mutable_column_device_view constructors + * to build device_views from views. + * + * It is used to build the array of child columns in device memory. Since child columns can + * also have child columns, this uses recursion to build up the flat device buffer to contain + * all the children and set the member pointers appropriately. + * + * This is accomplished by laying out all the children and grand-children into a flat host + * buffer first but also keep a running device pointer to use when setting the + * d_children array result. + * + * This function is provided both the host pointer in which to insert its children (and + * by recursion its grand-children) and the device pointer to be used when calculating + * ultimate device pointer for the d_children member. + * + * @tparam ColumnView is either column_view or mutable_column_view + * @tparam ColumnDeviceView is either column_device_view or mutable_column_device_view + * + * @param child_begin Iterator pointing to begin of child columns to make into a device view + * @param child_begin Iterator pointing to end of child columns to make into a device view + * @param h_ptr The host memory where to place any child data + * @param d_ptr The device pointer for calculating the d_children member of any child data + * @return The device pointer to be used for the d_children member of the given column + */ +template +ColumnDeviceView* child_columns_to_device_array(ColumnViewIterator child_begin, + ColumnViewIterator child_end, + void* h_ptr, + void* d_ptr) +{ + ColumnDeviceView* d_children = detail::align_ptr_for_type(d_ptr); + auto num_children = std::distance(child_begin, child_end); + if (num_children > 0) { + // The beginning of the memory must be the fixed-sized ColumnDeviceView + // struct objects in order for d_children to be used as an array. + auto h_column = detail::align_ptr_for_type(h_ptr); + auto d_column = d_children; + + // Any child data is assigned past the end of this array: h_end and d_end. + auto h_end = reinterpret_cast(h_column + num_children); + auto d_end = reinterpret_cast(d_column + num_children); + std::for_each(child_begin, child_end, [&](auto const& col) { + // inplace-new each child into host memory + new (h_column) ColumnDeviceView(col, h_end, d_end); + h_column++; // advance to next child + // update the pointers for holding this child column's child data + auto col_child_data_size = ColumnDeviceView::extent(col) - sizeof(ColumnDeviceView); + h_end += col_child_data_size; + d_end += col_child_data_size; + }); + } + return d_children; +} + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 0a3f7881b2c..e491cc30c90 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -535,6 +535,16 @@ class mutable_column_view : public detail::column_view_base { **/ size_type num_children() const noexcept { return mutable_children.size(); } + /** + * @brief Returns iterator to the beginning of the ordered sequence of child column-views. + */ + auto child_begin() const noexcept { return mutable_children.begin(); } + + /** + * @brief Returns iterator to the end of the ordered sequence of child column-views. + */ + auto child_end() const noexcept { return mutable_children.end(); } + /** * @brief Converts a mutable view into an immutable view * diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 33c61414a1c..141caa1fc8c 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -174,5 +174,23 @@ void device_single_thread(Functor functor, rmm::cuda_stream_view stream = rmm::c single_thread_kernel<<<1, 1, 0, stream.value()>>>(functor); } +/** + * @brief Returns the aligned address for holding array of type T in pre-allocated memory + * @param destination pointer to pre-allocated contiguous storage to store type T. + * @return Pointer of type T, aligned to alignment of type T. + */ +template +T* align_ptr_for_type(void* destination) +{ + constexpr std::size_t bytes_needed{sizeof(T)}; + constexpr std::size_t alignment{alignof(T)}; + + // pad the allocation for aligning the first pointer + auto padded_bytes_needed = bytes_needed + (alignment - 1); + // std::align captures last argument by reference and modifies it, but we don't want it modified + return reinterpret_cast( + std::align(alignment, bytes_needed, destination, padded_bytes_needed)); +} + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index f34a265a50a..e2263e4f5df 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -23,6 +23,7 @@ #include #include +#include /** * @file table_device_view.cuh @@ -112,4 +113,40 @@ class mutable_table_device_view { } }; + +template +auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_stream_view stream) +{ + // First calculate the size of memory needed to hold the + // table's ColumnDeviceViews. This is done by calling extent() + // for each of the table's ColumnViews columns. + std::size_t views_size_bytes = std::accumulate( + source_view.begin(), source_view.end(), std::size_t{0}, [](std::size_t init, auto col) { + return init + ColumnDeviceView::extent(col); + }); + // pad the allocation for aligning the first pointer + auto padded_views_size_bytes = views_size_bytes + std::size_t{alignof(ColumnDeviceView) - 1}; + // A buffer of CPU memory is allocated to hold the ColumnDeviceView + // objects. Once filled, the CPU memory is then copied to device memory + // and the pointer is set in the d_columns member. + std::vector h_buffer(padded_views_size_bytes); + // Each ColumnDeviceView instance may have child objects which may + // require setting some internal device pointers before being copied + // from CPU to device. + // Allocate the device memory to be used in the result. + // We need this pointer in order to pass it down when creating the + // ColumnDeviceViews so the column can set the pointer(s) for any + // of its child objects. + // align both h_ptr, d_ptr + auto descendant_storage = std::make_unique(padded_views_size_bytes, stream); + void* h_ptr = detail::align_ptr_for_type(h_buffer.data()); + void* d_ptr = detail::align_ptr_for_type(descendant_storage->data()); + auto d_columns = detail::child_columns_to_device_array( + source_view.begin(), source_view.end(), h_ptr, d_ptr); + + CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value())); + stream.synchronize(); + return std::make_tuple(std::move(descendant_storage), d_columns); +} + } // namespace cudf diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index fb54c9b0bcc..283c4b42d92 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -39,58 +39,6 @@ column_device_view::column_device_view(column_view source) void column_device_view::destroy() { delete this; } namespace { -/** - * @brief Helper function for use by column_device_view and mutable_column_device_view constructors - * to build device_views from views. - * - * It is used to build the array of child columns in device memory. Since child columns can - * also have child columns, this uses recursion to build up the flat device buffer to contain - * all the children and set the member pointers appropriately. - * - * This is accomplished by laying out all the children and grand-children into a flat host - * buffer first but also keep a running device pointer to but used when setting the - * d_children array result. - * - * This function is provided both the host pointer in which to insert its children (and - * by recursion its grand-children) and the device pointer to be used when calculating - * ultimate device pointer for the d_children member. - * - * @tparam ColumnView is either column_view or mutable_column_view - * @tparam ColumnDeviceView is either column_device_view or mutable_column_device_view - * - * @param source The column view to make into a device view - * @param h_ptr The host memory where to place any child data - * @param d_ptr The device pointer for calculating the d_children member of any child data - * @return The device pointer to be used for the d_children member of the given column - */ -template -ColumnDeviceView* child_columns_to_device_array(ColumnView const& source, void* h_ptr, void* d_ptr) -{ - ColumnDeviceView* d_children = nullptr; - size_type num_children = source.num_children(); - if (num_children > 0) { - // The beginning of the memory must be the fixed-sized ColumnDeviceView - // struct objects in order for d_children to be used as an array. - auto h_column = reinterpret_cast(h_ptr); - auto d_column = reinterpret_cast(d_ptr); - // Any child data is assigned past the end of this array: h_end and d_end. - auto h_end = reinterpret_cast(h_column + num_children); - auto d_end = reinterpret_cast(d_column + num_children); - d_children = d_column; // set children pointer for return - for (size_type idx = 0; idx < num_children; ++idx) { - // inplace-new each child into host memory - auto child = source.child(idx); - new (h_column) ColumnDeviceView(child, h_end, d_end); - h_column++; // advance to next child - // update the pointers for holding this child column's child data - auto col_child_data_size = ColumnDeviceView::extent(child) - sizeof(ColumnDeviceView); - h_end += col_child_data_size; - d_end += col_child_data_size; - } - } - return d_children; -} - // helper function for column_device_view::create and mutable_column_device::create methods template std::unique_ptr> @@ -104,8 +52,9 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str thrust::make_counting_iterator(0), [&source](auto i) { return ColumnDeviceView::extent(source.child(i)); }); - auto const descendant_storage_bytes = - std::accumulate(get_extent, get_extent + num_children, std::size_t{0}); + // pad the allocation for aligning the first pointer + auto const descendant_storage_bytes = std::accumulate( + get_extent, get_extent + num_children, std::size_t{alignof(ColumnDeviceView) - 1}); // A buffer of CPU memory is allocated to hold the ColumnDeviceView // objects. Once filled, the CPU memory is copied to device memory @@ -150,7 +99,8 @@ column_device_view::column_device_view(column_view source, void* h_ptr, void* d_ source.offset()}, _num_children{source.num_children()} { - d_children = child_columns_to_device_array(source, h_ptr, d_ptr); + d_children = detail::child_columns_to_device_array( + source.child_begin(), source.child_end(), h_ptr, d_ptr); } // Construct a unique_ptr that invokes `destroy()` as it's deleter @@ -196,8 +146,8 @@ mutable_column_device_view::mutable_column_device_view(mutable_column_view sourc source.offset()}, _num_children{source.num_children()} { - d_children = child_columns_to_device_array( - source, h_ptr, d_ptr); + d_children = detail::child_columns_to_device_array( + source.child_begin(), source.child_end(), h_ptr, d_ptr); } // Handle freeing children diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index ffa6eb9a076..b4eab65d2ad 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -66,34 +67,22 @@ struct chars_size_transform { auto create_strings_device_views(std::vector const& views, rmm::cuda_stream_view stream) { - // Create device views for each input view - using CDViewPtr = decltype( - column_device_view::create(std::declval(), std::declval())); - auto device_view_owners = std::vector(views.size()); - std::transform( - views.cbegin(), views.cend(), device_view_owners.begin(), [stream](auto const& col) { - return column_device_view::create(col, stream); - }); - + CUDF_FUNC_RANGE(); // Assemble contiguous array of device views - auto device_views = thrust::host_vector(); - device_views.reserve(views.size()); - std::transform(device_view_owners.cbegin(), - device_view_owners.cend(), - std::back_inserter(device_views), - [](auto const& col) { return *col; }); - auto d_views = rmm::device_vector{device_views}; + std::unique_ptr device_view_owners; + column_device_view* device_views_ptr; + std::tie(device_view_owners, device_views_ptr) = + contiguous_copy_column_device_views(views, stream); // Compute the partition offsets and size of offset column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type auto input_offsets = thrust::host_vector(views.size() + 1); - thrust::transform_inclusive_scan( - thrust::host, - device_views.cbegin(), - device_views.cend(), - std::next(input_offsets.begin()), - [](auto const& col) { return static_cast(col.size()); }, - thrust::plus{}); + auto offset_it = std::next(input_offsets.begin()); + thrust::transform( + thrust::host, views.cbegin(), views.cend(), offset_it, [](auto const& col) -> size_t { + return static_cast(col.size()); + }); + thrust::inclusive_scan(thrust::host, offset_it, input_offsets.end(), offset_it); auto const d_input_offsets = rmm::device_vector{input_offsets}; auto const output_size = input_offsets.back(); @@ -105,8 +94,8 @@ auto create_strings_device_views(std::vector const& views, // referenced -- it is a deleted function auto d_partition_offsets = rmm::device_vector(views.size() + 1); thrust::transform(rmm::exec_policy(stream)->on(stream.value()), - d_views.cbegin(), - d_views.cend(), + device_views_ptr, + device_views_ptr + views.size(), std::next(d_partition_offsets.begin()), chars_size_transform{}); thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), @@ -116,7 +105,7 @@ auto create_strings_device_views(std::vector const& views, auto const output_chars_size = d_partition_offsets.back(); return std::make_tuple(std::move(device_view_owners), - std::move(d_views), + device_views_ptr, std::move(d_input_offsets), std::move(d_partition_offsets), output_size, @@ -219,6 +208,7 @@ std::unique_ptr concatenate(std::vector const& columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); // Compute output sizes auto const device_views = create_strings_device_views(columns, stream); auto const& d_views = std::get<1>(device_views); @@ -265,10 +255,10 @@ std::unique_ptr concatenate(std::vector const& columns, auto const kernel = has_nulls ? fused_concatenate_string_offset_kernel : fused_concatenate_string_offset_kernel; kernel<<>>( - d_views.data().get(), + d_views, d_input_offsets.data().get(), d_partition_offsets.data().get(), - static_cast(d_views.size()), + static_cast(columns.size()), strings_count, d_new_offsets, reinterpret_cast(null_mask.data()), @@ -285,9 +275,9 @@ std::unique_ptr concatenate(std::vector const& columns, cudf::detail::grid_1d config(total_bytes, block_size); auto const kernel = fused_concatenate_string_chars_kernel; kernel<<>>( - d_views.data().get(), + d_views, d_partition_offsets.data().get(), - static_cast(d_views.size()), + static_cast(columns.size()), total_bytes, d_new_chars); } else { diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index a2cb69044ed..bdce1c325c5 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -21,10 +21,6 @@ #include -#include -#include -#include - namespace cudf { namespace detail { template @@ -43,50 +39,10 @@ table_device_view_base::table_device_view_base( // objects and copied into device memory for the table_device_view's // _columns member. if (source_view.num_columns() > 0) { - // - // First calculate the size of memory needed to hold the - // table's ColumnDeviceViews. This is done by calling extent() - // for each of the table's ColumnViews columns. - std::size_t views_size_bytes = - std::accumulate(source_view.begin(), source_view.end(), 0, [](std::size_t init, auto col) { - return init + ColumnDeviceView::extent(col); - }); - // A buffer of CPU memory is allocated to hold the ColumnDeviceView - // objects. Once filled, the CPU memory is then copied to device memory - // and the pointer is set in the _columns member. - std::vector h_buffer(views_size_bytes); - ColumnDeviceView* h_column = reinterpret_cast(h_buffer.data()); - // Each ColumnDeviceView instance may have child objects which may - // require setting some internal device pointers before being copied - // from CPU to device. - // Allocate the device memory to be used in the result. - // We need this pointer in order to pass it down when creating the - // ColumnDeviceViews so the column can set the pointer(s) for any - // of its child objects. - _descendant_storage = new rmm::device_buffer(views_size_bytes, stream); - _columns = reinterpret_cast(_descendant_storage->data()); - // The beginning of the memory must be the fixed-sized ColumnDeviceView - // objects in order for _columns to be used as an array. Therefore, - // any child data is assigned to the end of this array (h_end/d_end). - auto h_end = (int8_t*)(h_column + source_view.num_columns()); - auto d_end = (int8_t*)(_columns + source_view.num_columns()); - // Create the ColumnDeviceView from each column within the CPU memory - // Any column child data should be copied into h_end and any - // internal pointers should be set using d_end. - for (auto itr = source_view.begin(); itr != source_view.end(); ++itr) { - auto col = *itr; - // convert the ColumnView into ColumnDeviceView - new (h_column) ColumnDeviceView(col, h_end, d_end); - h_column++; // point to memory slot for the next ColumnDeviceView - // update the pointers for holding ColumnDeviceView's child data - auto col_child_data_size = (ColumnDeviceView::extent(col) - sizeof(ColumnDeviceView)); - h_end += col_child_data_size; - d_end += col_child_data_size; - } - - CUDA_TRY(cudaMemcpyAsync( - _columns, h_buffer.data(), views_size_bytes, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + std::unique_ptr descendant_storage_owner; + std::tie(descendant_storage_owner, _columns) = + contiguous_copy_column_device_views(source_view, stream); + _descendant_storage = descendant_storage_owner.release(); } } diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index 96338ca35b1..f89d6b8cd42 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -147,6 +148,42 @@ TEST_F(StringColumnTest, ConcatenateColumnView) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringColumnTest, ConcatenateTooManyColumns) +{ + std::vector h_strings{"aaa", + "bb", + "", + "cccc", + "d", + "ééé", + "ff", + "gggg", + "", + "h", + "iiii", + "jjj", + "k", + "lllllll", + "mmmmm", + "n", + "oo", + "ppp"}; + + std::vector expected_strings; + std::vector wrappers; + std::vector strings_columns; + std::string expected_string; + for (int i = 0; i < 200; ++i) { + wrappers.emplace_back(h_strings.data(), h_strings.data() + h_strings.size()); + strings_columns.push_back(wrappers[i]); + expected_strings.insert(expected_strings.end(), h_strings.begin(), h_strings.end()); + } + cudf::test::strings_column_wrapper expected(expected_strings.data(), + expected_strings.data() + expected_strings.size()); + auto results = cudf::concatenate(strings_columns); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + struct TableTest : public cudf::test::BaseFixture { };