Skip to content

Commit

Permalink
Use vector factories for host-device copies. (#9806)
Browse files Browse the repository at this point in the history
See: #9588 (comment)

In a recent PR review, @jrhemstad suggested switching to vector factories for one-way host-device data copying (that is, cases where using a `hostdevice_vector` isn't the right choice).

This PR applies that suggestion more broadly across the code base, replacing a number of simple cases where a (device) vector was being constructed followed by a call to `CUDA_TRY(cudaMemcpyAsync(...))` with the corresponding factory functions. This makes the code a little more concise and encourages broader use of these factory functions in the future.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - MithunR (https://github.com/mythrocks)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - David Wendt (https://github.com/davidwendt)

URL: #9806
  • Loading branch information
bdice authored Dec 6, 2021
1 parent 3b93f5c commit 8ceed73
Show file tree
Hide file tree
Showing 8 changed files with 23 additions and 64 deletions.
10 changes: 2 additions & 8 deletions cpp/src/dictionary/detail/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/dictionary/detail/concatenate.hpp>
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/dictionary/dictionary_factories.hpp>
Expand Down Expand Up @@ -104,14 +105,7 @@ struct compute_children_offsets_fn {
[](auto lhs, auto rhs) {
return offsets_pair{lhs.first + rhs.first, lhs.second + rhs.second};
});
auto d_offsets = rmm::device_uvector<offsets_pair>(offsets.size(), stream);
CUDA_TRY(cudaMemcpyAsync(d_offsets.data(),
offsets.data(),
offsets.size() * sizeof(offsets_pair),
cudaMemcpyHostToDevice,
stream.value()));
stream.synchronize();
return d_offsets;
return cudf::detail::make_device_uvector_sync(offsets, stream);
}

private:
Expand Down
18 changes: 5 additions & 13 deletions cpp/src/io/orc/timezone.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
*/
#include "timezone.cuh"

#include <cudf/detail/utilities/vector_factories.hpp>

#include <algorithm>
#include <fstream>

Expand Down Expand Up @@ -459,19 +461,9 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name,
.count();
}

rmm::device_uvector<int64_t> d_ttimes{ttimes.size(), stream};
CUDA_TRY(cudaMemcpyAsync(d_ttimes.data(),
ttimes.data(),
ttimes.size() * sizeof(int64_t),
cudaMemcpyDefault,
stream.value()));
rmm::device_uvector<int32_t> d_offsets{offsets.size(), stream};
CUDA_TRY(cudaMemcpyAsync(d_offsets.data(),
offsets.data(),
offsets.size() * sizeof(int32_t),
cudaMemcpyDefault,
stream.value()));
auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset);
rmm::device_uvector<int64_t> d_ttimes = cudf::detail::make_device_uvector_async(ttimes, stream);
rmm::device_uvector<int32_t> d_offsets = cudf::detail::make_device_uvector_async(offsets, stream);
auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset);
stream.synchronize();

return {gmt_offset, std::move(d_ttimes), std::move(d_offsets)};
Expand Down
18 changes: 5 additions & 13 deletions cpp/src/io/parquet/page_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
Expand Down Expand Up @@ -1716,19 +1717,10 @@ dremel_data get_dremel_data(column_view h_col,
},
stream);

thrust::host_vector<size_type> column_offsets(d_column_offsets.size());
CUDA_TRY(cudaMemcpyAsync(column_offsets.data(),
d_column_offsets.data(),
d_column_offsets.size() * sizeof(size_type),
cudaMemcpyDeviceToHost,
stream.value()));
thrust::host_vector<size_type> column_ends(d_column_ends.size());
CUDA_TRY(cudaMemcpyAsync(column_ends.data(),
d_column_ends.data(),
d_column_ends.size() * sizeof(size_type),
cudaMemcpyDeviceToHost,
stream.value()));

thrust::host_vector<size_type> column_offsets =
cudf::detail::make_host_vector_async(d_column_offsets, stream);
thrust::host_vector<size_type> column_ends =
cudf::detail::make_host_vector_async(d_column_ends, stream);
stream.synchronize();

size_t max_vals_size = 0;
Expand Down
7 changes: 1 addition & 6 deletions cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -673,12 +673,7 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node,
_nullability = std::vector<uint8_t>(r_nullability.crbegin(), r_nullability.crend());
// TODO(cp): Explore doing this for all columns in a single go outside this ctor. Maybe using
// hostdevice_vector. Currently this involves a cudaMemcpyAsync for each column.
_d_nullability = rmm::device_uvector<uint8_t>(_nullability.size(), stream);
CUDA_TRY(cudaMemcpyAsync(_d_nullability.data(),
_nullability.data(),
_nullability.size() * sizeof(uint8_t),
cudaMemcpyHostToDevice,
stream.value()));
_d_nullability = cudf::detail::make_device_uvector_async(_nullability, stream);

_is_list = (_max_rep_level > 0);

Expand Down
8 changes: 2 additions & 6 deletions cpp/src/strings/filter_chars.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -126,12 +127,7 @@ std::unique_ptr<column> filter_characters(
characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) {
return char_range{entry.first, entry.second};
});
rmm::device_uvector<char_range> table(table_size, stream);
CUDA_TRY(cudaMemcpyAsync(table.data(),
htable.data(),
table_size * sizeof(char_range),
cudaMemcpyHostToDevice,
stream.value()));
rmm::device_uvector<char_range> table = cudf::detail::make_device_uvector_async(htable, stream);

auto d_strings = column_device_view::create(strings.parent(), stream);

Expand Down
9 changes: 3 additions & 6 deletions cpp/src/strings/replace/backref_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/replace_re.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -116,12 +117,8 @@ std::unique_ptr<column> replace_with_backrefs(

// parse the repl string for back-ref indicators
auto const parse_result = parse_backrefs(repl);
rmm::device_uvector<backref_type> backrefs(parse_result.second.size(), stream);
CUDA_TRY(cudaMemcpyAsync(backrefs.data(),
parse_result.second.data(),
sizeof(backref_type) * backrefs.size(),
cudaMemcpyHostToDevice,
stream.value()));
rmm::device_uvector<backref_type> backrefs =
cudf::detail::make_device_uvector_async(parse_result.second, stream);
string_scalar repl_scalar(parse_result.first, true, stream);
string_view const d_repl_template = repl_scalar.value();

Expand Down
9 changes: 3 additions & 6 deletions cpp/src/strings/translate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -101,12 +102,8 @@ std::unique_ptr<column> translate(
return lhs.first < rhs.first;
});
// copy translate table to device memory
rmm::device_uvector<translate_table> table(htable.size(), stream);
CUDA_TRY(cudaMemcpyAsync(table.data(),
htable.data(),
sizeof(translate_table) * htable.size(),
cudaMemcpyHostToDevice,
stream.value()));
rmm::device_uvector<translate_table> table =
cudf::detail::make_device_uvector_async(htable, stream);

auto d_strings = column_device_view::create(strings.parent(), stream);

Expand Down
8 changes: 2 additions & 6 deletions cpp/src/transform/row_bit_count.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/table/table_device_view.cuh>
Expand Down Expand Up @@ -496,12 +497,7 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
auto d_cols = contiguous_copy_column_device_views<column_device_view>(cols, stream);

// move stack info to the gpu
rmm::device_uvector<column_info> d_info(info.size(), stream);
CUDA_TRY(cudaMemcpyAsync(d_info.data(),
info.data(),
sizeof(column_info) * info.size(),
cudaMemcpyHostToDevice,
stream.value()));
rmm::device_uvector<column_info> d_info = cudf::detail::make_device_uvector_async(info, stream);

// each thread needs to maintain a stack of row spans of size max_branch_depth. we will use
// shared memory to do this rather than allocating a potentially gigantic temporary buffer
Expand Down

0 comments on commit 8ceed73

Please sign in to comment.