Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reduce HtoD copies in cudf::concatenate #6605

Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
cd3fbf9
add create_contiguous_device_views (with profiling)
karthikeyann Oct 27, 2020
3a9ca3a
fix include, use cudf::thread_range
karthikeyann Oct 28, 2020
f30f8df
changelog entry for PR #6605
karthikeyann Oct 28, 2020
705fcbe
add list_of_column_device_views to reuse table_device_view_base code
karthikeyann Oct 28, 2020
2ca290e
share the template code for contiguous column_device_view copy
karthikeyann Oct 29, 2020
9c4dfab
typo fix
karthikeyann Oct 29, 2020
de44eee
Merge branch 'branch-0.17' of github.com:rapidsai/cudf into enh-strin…
karthikeyann Oct 29, 2020
c2e700a
Merge branch 'branch-0.17' of github.com:rapidsai/cudf into enh-strin…
karthikeyann Nov 5, 2020
088c4c1
align device pointer for children columns copy to device memory
karthikeyann Nov 10, 2020
117bdbd
Merge branch 'branch-0.17' of github.com:rapidsai/cudf into enh-strin…
karthikeyann Nov 10, 2020
a4f58ff
move alias_ptr_for_type
karthikeyann Nov 19, 2020
aee16b9
Merge branch 'branch-0.17' of github.com:rapidsai/cudf into enh-strin…
karthikeyann Nov 19, 2020
bdedebb
Apply suggestions from code review (harrism)
karthikeyann Nov 20, 2020
40cee88
stylefix, remove profile code
karthikeyann Nov 20, 2020
9ba2b6f
review comments
karthikeyann Nov 23, 2020
4480956
Merge branch 'branch-0.17' of github.com:rapidsai/cudf into enh-strin…
karthikeyann Nov 23, 2020
eebd99c
align before child columns call
karthikeyann Nov 24, 2020
a87dd4a
typo fix
karthikeyann Nov 24, 2020
0bf94bc
Merge branch 'branch-0.17' into enh-string_concatenate_HtoD_copies
karthikeyann Nov 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
- PR #6597 Use thread-local to track CUDA device in JNI
- PR #6599 Replace `size()==0` with `empty()`, `is_empty()`
- PR #6514 Initial work for decimal type in Java/JNI
- PR #6605 Reduce HtoD copies in `cudf::concatenate` of string columns
- PR #6608 Improve subword tokenizer docs
- PR #6612 Update JNI to new RMM cuda_stream_view API

Expand Down
56 changes: 54 additions & 2 deletions cpp/include/cudf/table/table_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <cassert>
#include <memory>
#include <numeric>

/**
* @file table_device_view.cuh
Expand All @@ -39,9 +40,9 @@ class table_device_view_base {
table_device_view_base& operator=(table_device_view_base const&) = default;
table_device_view_base& operator=(table_device_view_base&&) = default;

__device__ ColumnDeviceView* begin() const noexcept { return _columns; }
__device__ __host__ ColumnDeviceView* begin() const noexcept { return _columns; }

__device__ ColumnDeviceView* end() const noexcept { return _columns + _num_columns; }
__device__ __host__ ColumnDeviceView* end() const noexcept { return _columns + _num_columns; }

__device__ ColumnDeviceView const& column(size_type column_index) const noexcept
{
Expand Down Expand Up @@ -109,4 +110,55 @@ class mutable_table_device_view
{
}
};

template <typename ColumnDeviceView, typename HostTableView>
auto contiguous_copy_column_device_views(HostTableView source_view, cudaStream_t stream)
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
{
//
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
// First calculate the size of memory needed to hold the
// table's ColumnDeviceViews. This is done by calling extent()
// for each of the table's ColumnViews columns.
std::size_t views_size_bytes =
std::accumulate(source_view.begin(), source_view.end(), 0, [](std::size_t init, auto col) {
return init + ColumnDeviceView::extent(col);
});
auto num_columns = std::distance(source_view.begin(), source_view.end());
// A buffer of CPU memory is allocated to hold the ColumnDeviceView
// objects. Once filled, the CPU memory is then copied to device memory
// and the pointer is set in the _columns member.
std::vector<int8_t> h_buffer(views_size_bytes);
ColumnDeviceView* h_column = reinterpret_cast<ColumnDeviceView*>(h_buffer.data());
// Each ColumnDeviceView instance may have child objects which may
// require setting some internal device pointers before being copied
// from CPU to device.
// Allocate the device memory to be used in the result.
// We need this pointer in order to pass it down when creating the
// ColumnDeviceViews so the column can set the pointer(s) for any
// of its child objects.
auto _descendant_storage = new rmm::device_buffer(views_size_bytes, stream);
auto _columns = reinterpret_cast<ColumnDeviceView*>(_descendant_storage->data());
// The beginning of the memory must be the fixed-sized ColumnDeviceView
// objects in order for _columns to be used as an array. Therefore,
// any child data is assigned to the end of this array (h_end/d_end).
auto h_end = (int8_t*)(h_column + num_columns);
auto d_end = (int8_t*)(_columns + num_columns);
// Create the ColumnDeviceView from each column within the CPU memory
// Any column child data should be copied into h_end and any
// internal pointers should be set using d_end.
for (auto itr = source_view.begin(); itr != source_view.end(); ++itr) {
auto col = *itr;
// convert the ColumnView into ColumnDeviceView
new (h_column) ColumnDeviceView(col, h_end, d_end);
jrhemstad marked this conversation as resolved.
Show resolved Hide resolved
h_column++; // point to memory slot for the next ColumnDeviceView
// update the pointers for holding ColumnDeviceView's child data
auto col_child_data_size = (ColumnDeviceView::extent(col) - sizeof(ColumnDeviceView));
h_end += col_child_data_size;
d_end += col_child_data_size;
}

CUDA_TRY(cudaMemcpyAsync(_columns, h_buffer.data(), views_size_bytes, cudaMemcpyDefault, stream));
CUDA_TRY(cudaStreamSynchronize(stream));
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
return _descendant_storage;
}

} // namespace cudf
53 changes: 24 additions & 29 deletions cpp/src/strings/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/strings/detail/concatenate.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_device_view.cuh>

#include <thrust/binary_search.h>
#include <thrust/for_each.h>
Expand Down Expand Up @@ -62,37 +63,29 @@ struct chars_size_transform {

auto create_strings_device_views(std::vector<column_view> const& views, cudaStream_t stream)
{
CUDF_FUNC_RANGE();
// Create device views for each input view
using CDViewPtr =
decltype(column_device_view::create(std::declval<column_view>(), std::declval<cudaStream_t>()));
auto device_view_owners = std::vector<CDViewPtr>(views.size());
std::transform(
views.cbegin(), views.cend(), device_view_owners.begin(), [stream](auto const& col) {
return column_device_view::create(col, stream);
});

cudf::thread_range r1{"device_view_owners"};
cudf::thread_range r2{"create_contiguous_device_views"};
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
// Assemble contiguous array of device views
auto device_views = thrust::host_vector<column_device_view>();
device_views.reserve(views.size());
std::transform(device_view_owners.cbegin(),
device_view_owners.cend(),
std::back_inserter(device_views),
[](auto const& col) { return *col; });
auto d_views = rmm::device_vector<column_device_view>{device_views};
auto device_view_owners = std::unique_ptr<rmm::device_buffer>(
contiguous_copy_column_device_views<column_device_view>(views, stream));
auto device_views_ptr = reinterpret_cast<column_device_view*>(device_view_owners->data());

cudf::thread_range r3{"input_offsets"};
// Compute the partition offsets and size of offset column
// Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type
auto input_offsets = thrust::host_vector<size_t>(views.size() + 1);
thrust::transform_inclusive_scan(
thrust::host,
device_views.cbegin(),
device_views.cend(),
std::next(input_offsets.begin()),
[](auto const& col) { return static_cast<size_t>(col.size()); },
thrust::plus<size_t>{});
auto offset_it = std::next(input_offsets.begin());
thrust::transform(
thrust::host, views.cbegin(), views.cend(), offset_it, [](auto const& col) -> size_t {
return static_cast<size_t>(col.size());
});
thrust::inclusive_scan(thrust::host, offset_it, input_offsets.end(), offset_it);
auto const d_input_offsets = rmm::device_vector<size_t>{input_offsets};
auto const output_size = input_offsets.back();

cudf::thread_range r4{"d_partition_offsets"};
// Compute the partition offsets and size of chars column
// Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type
// Note: Using separate transform and inclusive_scan because
Expand All @@ -101,8 +94,8 @@ auto create_strings_device_views(std::vector<column_view> const& views, cudaStre
// referenced -- it is a deleted function
auto d_partition_offsets = rmm::device_vector<size_t>(views.size() + 1);
thrust::transform(rmm::exec_policy(stream)->on(stream),
d_views.cbegin(),
d_views.cend(),
device_views_ptr,
device_views_ptr + views.size(),
std::next(d_partition_offsets.begin()),
chars_size_transform{});
thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream),
Expand All @@ -111,8 +104,9 @@ auto create_strings_device_views(std::vector<column_view> const& views, cudaStre
d_partition_offsets.begin());
auto const output_chars_size = d_partition_offsets.back();

cudf::thread_range r5{"return_statement"};
return std::make_tuple(std::move(device_view_owners),
std::move(d_views),
device_views_ptr,
std::move(d_input_offsets),
std::move(d_partition_offsets),
output_size,
Expand Down Expand Up @@ -215,6 +209,7 @@ std::unique_ptr<column> concatenate(std::vector<column_view> const& columns,
rmm::mr::device_memory_resource* mr,
cudaStream_t stream)
{
CUDF_FUNC_RANGE();
// Compute output sizes
auto const device_views = create_strings_device_views(columns, stream);
auto const& d_views = std::get<1>(device_views);
Expand Down Expand Up @@ -260,10 +255,10 @@ std::unique_ptr<column> concatenate(std::vector<column_view> const& columns,
auto const kernel = has_nulls ? fused_concatenate_string_offset_kernel<block_size, true>
: fused_concatenate_string_offset_kernel<block_size, false>;
kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream>>>(
d_views.data().get(),
d_views,
d_input_offsets.data().get(),
d_partition_offsets.data().get(),
static_cast<size_type>(d_views.size()),
static_cast<size_type>(columns.size()),
strings_count,
d_new_offsets,
reinterpret_cast<bitmask_type*>(null_mask.data()),
Expand All @@ -280,9 +275,9 @@ std::unique_ptr<column> concatenate(std::vector<column_view> const& columns,
cudf::detail::grid_1d config(total_bytes, block_size);
auto const kernel = fused_concatenate_string_chars_kernel;
kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream>>>(
d_views.data().get(),
d_views,
d_partition_offsets.data().get(),
static_cast<size_type>(d_views.size()),
static_cast<size_type>(columns.size()),
total_bytes,
d_new_chars);
} else {
Expand Down
51 changes: 3 additions & 48 deletions cpp/src/table/table_device_view.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/error.hpp>

#include <algorithm>
#include <numeric>
#include <vector>

namespace cudf {
namespace detail {
template <typename ColumnDeviceView, typename HostTableView>
Expand All @@ -41,50 +37,9 @@ table_device_view_base<ColumnDeviceView, HostTableView>::table_device_view_base(
// objects and copied into device memory for the table_device_view's
// _columns member.
if (source_view.num_columns() > 0) {
//
// First calculate the size of memory needed to hold the
// table's ColumnDeviceViews. This is done by calling extent()
// for each of the table's ColumnViews columns.
std::size_t views_size_bytes =
std::accumulate(source_view.begin(), source_view.end(), 0, [](std::size_t init, auto col) {
return init + ColumnDeviceView::extent(col);
});
// A buffer of CPU memory is allocated to hold the ColumnDeviceView
// objects. Once filled, the CPU memory is then copied to device memory
// and the pointer is set in the _columns member.
std::vector<int8_t> h_buffer(views_size_bytes);
ColumnDeviceView* h_column = reinterpret_cast<ColumnDeviceView*>(h_buffer.data());
// Each ColumnDeviceView instance may have child objects which may
// require setting some internal device pointers before being copied
// from CPU to device.
// Allocate the device memory to be used in the result.
// We need this pointer in order to pass it down when creating the
// ColumnDeviceViews so the column can set the pointer(s) for any
// of its child objects.
_descendant_storage = new rmm::device_buffer(views_size_bytes, stream);
_columns = reinterpret_cast<ColumnDeviceView*>(_descendant_storage->data());
// The beginning of the memory must be the fixed-sized ColumnDeviceView
// objects in order for _columns to be used as an array. Therefore,
// any child data is assigned to the end of this array (h_end/d_end).
auto h_end = (int8_t*)(h_column + source_view.num_columns());
auto d_end = (int8_t*)(_columns + source_view.num_columns());
// Create the ColumnDeviceView from each column within the CPU memory
// Any column child data should be copied into h_end and any
// internal pointers should be set using d_end.
for (auto itr = source_view.begin(); itr != source_view.end(); ++itr) {
auto col = *itr;
// convert the ColumnView into ColumnDeviceView
new (h_column) ColumnDeviceView(col, h_end, d_end);
h_column++; // point to memory slot for the next ColumnDeviceView
// update the pointers for holding ColumnDeviceView's child data
auto col_child_data_size = (ColumnDeviceView::extent(col) - sizeof(ColumnDeviceView));
h_end += col_child_data_size;
d_end += col_child_data_size;
}

CUDA_TRY(
cudaMemcpyAsync(_columns, h_buffer.data(), views_size_bytes, cudaMemcpyDefault, stream));
CUDA_TRY(cudaStreamSynchronize(stream));
_descendant_storage =
contiguous_copy_column_device_views<ColumnDeviceView, HostTableView>(source_view, stream);
_columns = reinterpret_cast<ColumnDeviceView*>(_descendant_storage->data());
}
}

Expand Down
37 changes: 37 additions & 0 deletions cpp/tests/copying/concatenate_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <cudf/dictionary/encode.hpp>
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/table/table.hpp>
#include <string>

#include <thrust/sequence.h>

Expand Down Expand Up @@ -146,6 +147,42 @@ TEST_F(StringColumnTest, ConcatenateColumnView)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

// Concatenating many string columns must succeed and produce the inputs
// repeated in order. Exercises the contiguous device-view path with a large
// number of input columns.
TEST_F(StringColumnTest, ConcatenateTooManyColumns)
{
  // Mix of short, long, empty, and multi-byte UTF-8 entries.
  std::vector<const char*> h_strings{"aaa",
                                     "bb",
                                     "",
                                     "cccc",
                                     "d",
                                     "ééé",
                                     "ff",
                                     "gggg",
                                     "",
                                     "h",
                                     "iiii",
                                     "jjj",
                                     "k",
                                     "lllllll",
                                     "mmmmm",
                                     "n",
                                     "oo",
                                     "ppp"};

  constexpr int num_columns = 200;
  std::vector<const char*> expected_strings;
  std::vector<cudf::test::strings_column_wrapper> wrappers;
  std::vector<cudf::column_view> strings_columns;
  // Reserve up front: the column_views stored in strings_columns are created
  // from wrapper objects living in `wrappers`, so growth must never relocate
  // the wrappers after a view has been taken.
  wrappers.reserve(num_columns);
  strings_columns.reserve(num_columns);
  expected_strings.reserve(num_columns * h_strings.size());
  for (int i = 0; i < num_columns; ++i) {
    wrappers.emplace_back(h_strings.data(), h_strings.data() + h_strings.size());
    strings_columns.push_back(wrappers.back());
    expected_strings.insert(expected_strings.end(), h_strings.begin(), h_strings.end());
  }
  cudf::test::strings_column_wrapper expected(expected_strings.data(),
                                              expected_strings.data() + expected_strings.size());
  auto results = cudf::concatenate(strings_columns);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

struct TableTest : public cudf::test::BaseFixture {
};

Expand Down