From 0178d8310c78488e7aaa212f96c65b09c5bd3b39 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 30 Nov 2021 21:17:37 -0600 Subject: [PATCH] Use vector factories instead of constructing empty vectors and calling cudaMemcpyAsync. --- cpp/src/dictionary/detail/concatenate.cu | 10 ++-------- cpp/src/io/orc/timezone.cpp | 18 +++++------------- cpp/src/io/parquet/page_enc.cu | 18 +++++------------- cpp/src/io/parquet/writer_impl.cu | 7 +------ cpp/src/strings/filter_chars.cu | 8 ++------ cpp/src/strings/replace/backref_re.cu | 9 +++------ cpp/src/strings/translate.cu | 9 +++------ cpp/src/transform/row_bit_count.cu | 8 ++------ 8 files changed, 23 insertions(+), 64 deletions(-) diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index a3cac6ac5c1..fd86d8ec7d4 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -104,14 +105,7 @@ struct compute_children_offsets_fn { [](auto lhs, auto rhs) { return offsets_pair{lhs.first + rhs.first, lhs.second + rhs.second}; }); - auto d_offsets = rmm::device_uvector(offsets.size(), stream); - CUDA_TRY(cudaMemcpyAsync(d_offsets.data(), - offsets.data(), - offsets.size() * sizeof(offsets_pair), - cudaMemcpyHostToDevice, - stream.value())); - stream.synchronize(); - return d_offsets; + return cudf::detail::make_device_uvector_sync(offsets, stream); } private: diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 77fde0d1e75..3a1e8bf898a 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -15,6 +15,8 @@ */ #include "timezone.cuh" +#include + #include #include @@ -459,19 +461,9 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, .count(); } - rmm::device_uvector d_ttimes{ttimes.size(), stream}; - CUDA_TRY(cudaMemcpyAsync(d_ttimes.data(), - ttimes.data(), - ttimes.size() * sizeof(int64_t), - cudaMemcpyDefault, - stream.value())); - rmm::device_uvector d_offsets{offsets.size(), stream}; - CUDA_TRY(cudaMemcpyAsync(d_offsets.data(), - offsets.data(), - offsets.size() * sizeof(int32_t), - cudaMemcpyDefault, - stream.value())); - auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset); + rmm::device_uvector d_ttimes = cudf::detail::make_device_uvector_async(ttimes, stream); + rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(offsets, stream); + auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset); stream.synchronize(); return {gmt_offset, std::move(d_ttimes), std::move(d_offsets)}; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 48490426db7..3ca53d9e651 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -1716,19 +1717,10 @@ dremel_data get_dremel_data(column_view h_col, }, stream); - thrust::host_vector column_offsets(d_column_offsets.size()); - CUDA_TRY(cudaMemcpyAsync(column_offsets.data(), - d_column_offsets.data(), - d_column_offsets.size() * sizeof(size_type), - cudaMemcpyDeviceToHost, - stream.value())); - thrust::host_vector column_ends(d_column_ends.size()); - CUDA_TRY(cudaMemcpyAsync(column_ends.data(), - d_column_ends.data(), - d_column_ends.size() * sizeof(size_type), - cudaMemcpyDeviceToHost, - stream.value())); - + thrust::host_vector column_offsets = + cudf::detail::make_host_vector_async(d_column_offsets, stream); + thrust::host_vector column_ends = + cudf::detail::make_host_vector_async(d_column_ends, stream); stream.synchronize(); size_t max_vals_size = 0; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 62803432157..d1101b24d7e 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -673,12 +673,7 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, _nullability = std::vector(r_nullability.crbegin(), r_nullability.crend()); // TODO(cp): Explore doing this for all columns in a single go outside this ctor. Maybe using // hostdevice_vector. Currently this involves a cudaMemcpyAsync for each column. - _d_nullability = rmm::device_uvector(_nullability.size(), stream); - CUDA_TRY(cudaMemcpyAsync(_d_nullability.data(), - _nullability.data(), - _nullability.size() * sizeof(uint8_t), - cudaMemcpyHostToDevice, - stream.value())); + _d_nullability = cudf::detail::make_device_uvector_async(_nullability, stream); _is_list = (_max_rep_level > 0); diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 883a7fada75..7e45a609d34 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -126,12 +127,7 @@ std::unique_ptr filter_characters( characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) { return char_range{entry.first, entry.second}; }); - rmm::device_uvector table(table_size, stream); - CUDA_TRY(cudaMemcpyAsync(table.data(), - htable.data(), - table_size * sizeof(char_range), - cudaMemcpyHostToDevice, - stream.value())); + rmm::device_uvector table = cudf::detail::make_device_uvector_async(htable, stream); auto d_strings = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 87603e4c35b..99c55998fb9 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -116,12 +117,8 @@ std::unique_ptr replace_with_backrefs( // parse the repl string for back-ref indicators auto const parse_result = parse_backrefs(repl); - rmm::device_uvector backrefs(parse_result.second.size(), stream); - CUDA_TRY(cudaMemcpyAsync(backrefs.data(), - parse_result.second.data(), - sizeof(backref_type) * backrefs.size(), - cudaMemcpyHostToDevice, - stream.value())); + rmm::device_uvector backrefs = + cudf::detail::make_device_uvector_async(parse_result.second, stream); string_scalar repl_scalar(parse_result.first, true, stream); string_view const d_repl_template = repl_scalar.value(); diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index ad3515e8058..8761deab4a4 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -101,12 +102,8 @@ std::unique_ptr translate( return lhs.first < rhs.first; }); // copy translate table to device memory - rmm::device_uvector table(htable.size(), stream); - CUDA_TRY(cudaMemcpyAsync(table.data(), - htable.data(), - sizeof(translate_table) * htable.size(), - cudaMemcpyHostToDevice, - stream.value())); + rmm::device_uvector table = + cudf::detail::make_device_uvector_async(htable, stream); auto d_strings = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 06b03a6b36f..ff720daa5cb 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -496,12 +497,7 @@ std::unique_ptr row_bit_count(table_view const& t, auto d_cols = contiguous_copy_column_device_views(cols, stream); // move stack info to the gpu - rmm::device_uvector d_info(info.size(), stream); - CUDA_TRY(cudaMemcpyAsync(d_info.data(), - info.data(), - sizeof(column_info) * info.size(), - cudaMemcpyHostToDevice, - stream.value())); + rmm::device_uvector d_info = cudf::detail::make_device_uvector_async(info, stream); // each thread needs to maintain a stack of row spans of size max_branch_depth. we will use // shared memory to do this rather than allocating a potentially gigantic temporary buffer