Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reduce HtoD copies in cudf::concatenate #6605

Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
cd3fbf9
add create_contiguous_device_views (with profiling)
karthikeyann Oct 27, 2020
3a9ca3a
fix include, use cudf::thread_range
karthikeyann Oct 28, 2020
f30f8df
changelog entry for PR #6605
karthikeyann Oct 28, 2020
705fcbe
add list_of_column_device_views to reuse table_device_view_base code
karthikeyann Oct 28, 2020
2ca290e
share the template code for contiguous column_device_view copy
karthikeyann Oct 29, 2020
9c4dfab
typo fix
karthikeyann Oct 29, 2020
de44eee
Merge branch 'branch-0.17' of github.com:rapidsai/cudf into enh-strin…
karthikeyann Oct 29, 2020
c2e700a
Merge branch 'branch-0.17' of github.com:rapidsai/cudf into enh-strin…
karthikeyann Nov 5, 2020
088c4c1
align device pointer for children columns copy to device memory
karthikeyann Nov 10, 2020
117bdbd
Merge branch 'branch-0.17' of github.com:rapidsai/cudf into enh-strin…
karthikeyann Nov 10, 2020
a4f58ff
move alias_ptr_for_type
karthikeyann Nov 19, 2020
aee16b9
Merge branch 'branch-0.17' of github.com:rapidsai/cudf into enh-strin…
karthikeyann Nov 19, 2020
bdedebb
Apply suggestions from code review (harrism)
karthikeyann Nov 20, 2020
40cee88
stylefix, remove profile code
karthikeyann Nov 20, 2020
9ba2b6f
review comments
karthikeyann Nov 23, 2020
4480956
Merge branch 'branch-0.17' of github.com:rapidsai/cudf into enh-strin…
karthikeyann Nov 23, 2020
eebd99c
align before child columns call
karthikeyann Nov 24, 2020
a87dd4a
typo fix
karthikeyann Nov 24, 2020
0bf94bc
Merge branch 'branch-0.17' into enh-string_concatenate_HtoD_copies
karthikeyann Nov 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
- PR #6597 Use thread-local to track CUDA device in JNI
- PR #6599 Replace `size()==0` with `empty()`, `is_empty()`
- PR #6514 Initial work for decimal type in Java/JNI
- PR #6605 Reduce HtoD copies in `cudf::concatenate` of string columns
- PR #6608 Improve subword tokenizer docs
- PR #6610 Add ability to set scalar values in `cudf.DataFrame`
- PR #6612 Update JNI to new RMM cuda_stream_view API
Expand Down
58 changes: 58 additions & 0 deletions cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/lists/list_view.cuh>
#include <cudf/strings/string_view.cuh>
Expand Down Expand Up @@ -875,5 +876,62 @@ struct mutable_value_accessor {
__device__ T& operator()(cudf::size_type i) { return col.element<T>(i); }
};

/**
* @brief Helper function for use by column_device_view and mutable_column_device_view constructors
* to build device_views from views.
*
* It is used to build the array of child columns in device memory. Since child columns can
* also have child columns, this uses recursion to build up the flat device buffer to contain
* all the children and set the member pointers appropriately.
*
* This is accomplished by laying out all the children and grand-children into a flat host
* buffer first but also keep a running device pointer to but used when setting the
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
* d_children array result.
*
* This function is provided both the host pointer in which to insert its children (and
* by recursion its grand-children) and the device pointer to be used when calculating
* ultimate device pointer for the d_children member.
*
* @tparam ColumnView is either column_view or mutable_column_view
* @tparam ColumnDeviceView is either column_device_view or mutable_column_device_view
*
* @param child_begin Iterator pointing to begin of child columns to make into a device view
* @param child_begin Iterator pointing to end of child columns to make into a device view
* @param h_ptr The host memory where to place any child data
* @param d_ptr The device pointer for calculating the d_children member of any child data
* @return The device pointer to be used for the d_children member of the given column
*/
template <typename ColumnDeviceView, typename ColumnViewIterator>
ColumnDeviceView* child_columns_to_device_array(ColumnViewIterator child_begin,
ColumnViewIterator child_end,
void* h_ptr,
void* d_ptr)
{
ColumnDeviceView* d_children = nullptr;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
auto num_children = std::distance(child_begin, child_end);
if (num_children > 0) {
// The beginning of the memory must be the fixed-sized ColumnDeviceView
// struct objects in order for d_children to be used as an array.
auto h_column = detail::align_ptr_for_type<ColumnDeviceView>(h_ptr);
auto d_column = detail::align_ptr_for_type<ColumnDeviceView>(d_ptr);

// Any child data is assigned past the end of this array: h_end and d_end.
auto h_end = reinterpret_cast<int8_t*>(h_column + num_children);
auto d_end = reinterpret_cast<int8_t*>(d_column + num_children);
d_children = d_column; // set children pointer for return
for (auto itr = child_begin; itr != child_end; ++itr) {
// inplace-new each child into host memory
auto col = *itr;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
new (h_column) ColumnDeviceView(col, h_end, d_end);
h_column++; // advance to next child
// update the pointers for holding this child column's child data
auto col_child_data_size = ColumnDeviceView::extent(col) - sizeof(ColumnDeviceView);
h_end += col_child_data_size;
d_end += col_child_data_size;
}
}
return d_children;
}

} // namespace detail
} // namespace cudf
10 changes: 10 additions & 0 deletions cpp/include/cudf/column/column_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,16 @@ class mutable_column_view : public detail::column_view_base {
**/
size_type num_children() const noexcept { return mutable_children.size(); }

/**
* @brief Returns iterator to the beginning of the ordered sequence of child column-views.
*/
auto child_begin() const noexcept { return mutable_children.begin(); }

/**
* @brief Returns iterator to the end of the ordered sequence of child column-views.
*/
auto child_end() const noexcept { return mutable_children.end(); }

/**
* @brief Converts a mutable view into an immutable view
*
Expand Down
18 changes: 18 additions & 0 deletions cpp/include/cudf/detail/utilities/cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -173,5 +173,23 @@ void device_single_thread(Functor functor, cudaStream_t stream = 0)
single_thread_kernel<<<1, 1, 0, stream>>>(functor);
}

/**
* @brief Returns the aligned address for holding array of type T in pre-allocated memory
* @param destination pointer to pre-allocated contiguous storage to store type T.
* @return Pointer of type T, aligned to alignment of type T.
*/
template <typename T>
T* align_ptr_for_type(void* destination)
{
constexpr std::size_t bytes_needed{sizeof(T)};
constexpr std::size_t alignment{alignof(T)};

// pad the allocation for aligning the first pointer
auto padded_bytes_needed = bytes_needed + (alignment - 1);
// std::align captures last argument by reference and modifies it, but we don't want it modified
return reinterpret_cast<T*>(
std::align(alignment, bytes_needed, destination, padded_bytes_needed));
}

} // namespace detail
} // namespace cudf
39 changes: 39 additions & 0 deletions cpp/include/cudf/table/table_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <cassert>
#include <memory>
#include <numeric>

/**
* @file table_device_view.cuh
Expand Down Expand Up @@ -109,4 +110,42 @@ class mutable_table_device_view
{
}
};

template <typename ColumnDeviceView, typename HostTableView>
auto contiguous_copy_column_device_views(HostTableView source_view, cudaStream_t stream)
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
{
//
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
// First calculate the size of memory needed to hold the
// table's ColumnDeviceViews. This is done by calling extent()
// for each of the table's ColumnViews columns.
std::size_t views_size_bytes = std::accumulate(
source_view.begin(), source_view.end(), std::size_t{0}, [](std::size_t init, auto col) {
return init + ColumnDeviceView::extent(col);
});
// pad the allocation for aligning the first pointer
auto padded_views_size_bytes = views_size_bytes + std::size_t{alignof(ColumnDeviceView) - 1};
// A buffer of CPU memory is allocated to hold the ColumnDeviceView
// objects. Once filled, the CPU memory is then copied to device memory
// and the pointer is set in the d_columns member.
std::vector<int8_t> h_buffer(padded_views_size_bytes);
// Each ColumnDeviceView instance may have child objects which may
// require setting some internal device pointers before being copied
// from CPU to device.
// Allocate the device memory to be used in the result.
// We need this pointer in order to pass it down when creating the
// ColumnDeviceViews so the column can set the pointer(s) for any
// of its child objects.
auto _descendant_storage = new rmm::device_buffer(padded_views_size_bytes, stream);
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
void* h_ptr = h_buffer.data();
void* d_ptr = _descendant_storage->data();
auto d_columns = detail::child_columns_to_device_array<ColumnDeviceView>(
source_view.begin(), source_view.end(), h_ptr, d_ptr);

// align h_ptr also, because both h_ptr, d_ptr alignment will not be same!
auto aligned_hptr = detail::align_ptr_for_type<ColumnDeviceView>(h_buffer.data());
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
CUDA_TRY(cudaMemcpyAsync(d_columns, aligned_hptr, views_size_bytes, cudaMemcpyDefault, stream));
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
CUDA_TRY(cudaStreamSynchronize(stream));
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
return std::make_tuple(_descendant_storage, d_columns);
}

} // namespace cudf
64 changes: 7 additions & 57 deletions cpp/src/column/column_device_view.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,58 +39,6 @@ column_device_view::column_device_view(column_view source)
void column_device_view::destroy() { delete this; }

namespace {
/**
* @brief Helper function for use by column_device_view and mutable_column_device_view constructors
* to build device_views from views.
*
* It is used to build the array of child columns in device memory. Since child columns can
* also have child columns, this uses recursion to build up the flat device buffer to contain
* all the children and set the member pointers appropriately.
*
* This is accomplished by laying out all the children and grand-children into a flat host
* buffer first but also keep a running device pointer to but used when setting the
* d_children array result.
*
* This function is provided both the host pointer in which to insert its children (and
* by recursion its grand-children) and the device pointer to be used when calculating
* ultimate device pointer for the d_children member.
*
* @tparam ColumnView is either column_view or mutable_column_view
* @tparam ColumnDeviceView is either column_device_view or mutable_column_device_view
*
* @param source The column view to make into a device view
* @param h_ptr The host memory where to place any child data
* @param d_ptr The device pointer for calculating the d_children member of any child data
* @return The device pointer to be used for the d_children member of the given column
*/
template <typename ColumnView, typename ColumnDeviceView>
ColumnDeviceView* child_columns_to_device_array(ColumnView const& source, void* h_ptr, void* d_ptr)
{
ColumnDeviceView* d_children = nullptr;
size_type num_children = source.num_children();
if (num_children > 0) {
// The beginning of the memory must be the fixed-sized ColumnDeviceView
// struct objects in order for d_children to be used as an array.
auto h_column = reinterpret_cast<ColumnDeviceView*>(h_ptr);
auto d_column = reinterpret_cast<ColumnDeviceView*>(d_ptr);
// Any child data is assigned past the end of this array: h_end and d_end.
auto h_end = reinterpret_cast<int8_t*>(h_column + num_children);
auto d_end = reinterpret_cast<int8_t*>(d_column + num_children);
d_children = d_column; // set children pointer for return
for (size_type idx = 0; idx < num_children; ++idx) {
// inplace-new each child into host memory
auto child = source.child(idx);
new (h_column) ColumnDeviceView(child, h_end, d_end);
h_column++; // advance to next child
// update the pointers for holding this child column's child data
auto col_child_data_size = ColumnDeviceView::extent(child) - sizeof(ColumnDeviceView);
h_end += col_child_data_size;
d_end += col_child_data_size;
}
}
return d_children;
}

// helper function for column_device_view::create and mutable_column_device::create methods
template <typename ColumnView, typename ColumnDeviceView>
std::unique_ptr<ColumnDeviceView, std::function<void(ColumnDeviceView*)>>
Expand All @@ -104,8 +52,9 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str
thrust::make_counting_iterator(0),
[&source](auto i) { return ColumnDeviceView::extent(source.child(i)); });

auto const descendant_storage_bytes =
std::accumulate(get_extent, get_extent + num_children, std::size_t{0});
// pad the allocation for aligning the first pointer
auto const descendant_storage_bytes = std::accumulate(
get_extent, get_extent + num_children, std::size_t{alignof(ColumnDeviceView) - 1});

// A buffer of CPU memory is allocated to hold the ColumnDeviceView
// objects. Once filled, the CPU memory is copied to device memory
Expand Down Expand Up @@ -150,7 +99,8 @@ column_device_view::column_device_view(column_view source, void* h_ptr, void* d_
source.offset()},
_num_children{source.num_children()}
{
d_children = child_columns_to_device_array<column_view, column_device_view>(source, h_ptr, d_ptr);
d_children = detail::child_columns_to_device_array<column_device_view>(
source.child_begin(), source.child_end(), h_ptr, d_ptr);
}

// Construct a unique_ptr that invokes `destroy()` as it's deleter
Expand Down Expand Up @@ -196,8 +146,8 @@ mutable_column_device_view::mutable_column_device_view(mutable_column_view sourc
source.offset()},
_num_children{source.num_children()}
{
d_children = child_columns_to_device_array<mutable_column_view, mutable_column_device_view>(
source, h_ptr, d_ptr);
d_children = detail::child_columns_to_device_array<mutable_column_device_view>(
source.child_begin(), source.child_end(), h_ptr, d_ptr);
}

// Handle freeing children
Expand Down
56 changes: 26 additions & 30 deletions cpp/src/strings/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cudf/strings/detail/concatenate.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_device_view.cuh>

#include <thrust/binary_search.h>
#include <thrust/for_each.h>
Expand Down Expand Up @@ -63,37 +64,30 @@ struct chars_size_transform {

auto create_strings_device_views(std::vector<column_view> const& views, cudaStream_t stream)
{
CUDF_FUNC_RANGE();
// Create device views for each input view
using CDViewPtr =
decltype(column_device_view::create(std::declval<column_view>(), std::declval<cudaStream_t>()));
auto device_view_owners = std::vector<CDViewPtr>(views.size());
std::transform(
views.cbegin(), views.cend(), device_view_owners.begin(), [stream](auto const& col) {
return column_device_view::create(col, stream);
});

cudf::thread_range r1{"device_view_owners"};
cudf::thread_range r2{"create_contiguous_device_views"};
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
// Assemble contiguous array of device views
auto device_views = thrust::host_vector<column_device_view>();
device_views.reserve(views.size());
std::transform(device_view_owners.cbegin(),
device_view_owners.cend(),
std::back_inserter(device_views),
[](auto const& col) { return *col; });
auto d_views = rmm::device_vector<column_device_view>{device_views};
rmm::device_buffer* device_view_owners;
column_device_view* device_views_ptr;
std::tie(device_view_owners, device_views_ptr) =
contiguous_copy_column_device_views<column_device_view>(views, stream);

cudf::thread_range r3{"input_offsets"};
// Compute the partition offsets and size of offset column
// Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type
auto input_offsets = thrust::host_vector<size_t>(views.size() + 1);
thrust::transform_inclusive_scan(
thrust::host,
device_views.cbegin(),
device_views.cend(),
std::next(input_offsets.begin()),
[](auto const& col) { return static_cast<size_t>(col.size()); },
thrust::plus<size_t>{});
auto offset_it = std::next(input_offsets.begin());
thrust::transform(
thrust::host, views.cbegin(), views.cend(), offset_it, [](auto const& col) -> size_t {
return static_cast<size_t>(col.size());
});
thrust::inclusive_scan(thrust::host, offset_it, input_offsets.end(), offset_it);
auto const d_input_offsets = rmm::device_vector<size_t>{input_offsets};
auto const output_size = input_offsets.back();

cudf::thread_range r4{"d_partition_offsets"};
// Compute the partition offsets and size of chars column
// Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type
// Note: Using separate transform and inclusive_scan because
Expand All @@ -102,8 +96,8 @@ auto create_strings_device_views(std::vector<column_view> const& views, cudaStre
// referenced -- it is a deleted function
auto d_partition_offsets = rmm::device_vector<size_t>(views.size() + 1);
thrust::transform(rmm::exec_policy(stream)->on(stream),
d_views.cbegin(),
d_views.cend(),
device_views_ptr,
device_views_ptr + views.size(),
std::next(d_partition_offsets.begin()),
chars_size_transform{});
thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream),
Expand All @@ -112,8 +106,9 @@ auto create_strings_device_views(std::vector<column_view> const& views, cudaStre
d_partition_offsets.begin());
auto const output_chars_size = d_partition_offsets.back();

return std::make_tuple(std::move(device_view_owners),
std::move(d_views),
cudf::thread_range r5{"return_statement"};
return std::make_tuple(std::unique_ptr<rmm::device_buffer>(device_view_owners),
device_views_ptr,
std::move(d_input_offsets),
std::move(d_partition_offsets),
output_size,
Expand Down Expand Up @@ -216,6 +211,7 @@ std::unique_ptr<column> concatenate(std::vector<column_view> const& columns,
rmm::mr::device_memory_resource* mr,
cudaStream_t stream)
{
CUDF_FUNC_RANGE();
// Compute output sizes
auto const device_views = create_strings_device_views(columns, stream);
auto const& d_views = std::get<1>(device_views);
Expand Down Expand Up @@ -262,10 +258,10 @@ std::unique_ptr<column> concatenate(std::vector<column_view> const& columns,
auto const kernel = has_nulls ? fused_concatenate_string_offset_kernel<block_size, true>
: fused_concatenate_string_offset_kernel<block_size, false>;
kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream>>>(
d_views.data().get(),
d_views,
d_input_offsets.data().get(),
d_partition_offsets.data().get(),
static_cast<size_type>(d_views.size()),
static_cast<size_type>(columns.size()),
strings_count,
d_new_offsets,
reinterpret_cast<bitmask_type*>(null_mask.data()),
Expand All @@ -282,9 +278,9 @@ std::unique_ptr<column> concatenate(std::vector<column_view> const& columns,
cudf::detail::grid_1d config(total_bytes, block_size);
auto const kernel = fused_concatenate_string_chars_kernel;
kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream>>>(
d_views.data().get(),
d_views,
d_partition_offsets.data().get(),
static_cast<size_type>(d_views.size()),
static_cast<size_type>(columns.size()),
total_bytes,
d_new_chars);
} else {
Expand Down
Loading