Skip to content

Commit

Permalink
Use vector factories for host-device copies. (#9806)
Browse files Browse the repository at this point in the history
See: #9588 (comment)

In a recent PR review, @jrhemstad suggested switching to vector factories for one-way host-device data copying (that is, cases where using a `hostdevice_vector` isn't the right choice).

This PR applies that suggestion more broadly across the code base, replacing a number of simple cases where a (device) vector was being constructed followed by a call to `CUDA_TRY(cudaMemcpyAsync(...))` with the corresponding factory functions. This makes the code a little more concise and encourages broader use of these factory functions in the future.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - MithunR (https://github.com/mythrocks)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - David Wendt (https://github.com/davidwendt)

URL: #9806
  • Loading branch information
bdice authored Dec 6, 2021
1 parent 3b93f5c commit 8ceed73
Show file tree
Hide file tree
Showing 8 changed files with 23 additions and 64 deletions.
10 changes: 2 additions & 8 deletions cpp/src/dictionary/detail/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/dictionary/detail/concatenate.hpp>
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/dictionary/dictionary_factories.hpp>
Expand Down Expand Up @@ -104,14 +105,7 @@ struct compute_children_offsets_fn {
[](auto lhs, auto rhs) {
return offsets_pair{lhs.first + rhs.first, lhs.second + rhs.second};
});
auto d_offsets = rmm::device_uvector<offsets_pair>(offsets.size(), stream);
CUDA_TRY(cudaMemcpyAsync(d_offsets.data(),
offsets.data(),
offsets.size() * sizeof(offsets_pair),
cudaMemcpyHostToDevice,
stream.value()));
stream.synchronize();
return d_offsets;
return cudf::detail::make_device_uvector_sync(offsets, stream);
}

private:
Expand Down
18 changes: 5 additions & 13 deletions cpp/src/io/orc/timezone.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
*/
#include "timezone.cuh"

#include <cudf/detail/utilities/vector_factories.hpp>

#include <algorithm>
#include <fstream>

Expand Down Expand Up @@ -459,19 +461,9 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name,
.count();
}

rmm::device_uvector<int64_t> d_ttimes{ttimes.size(), stream};
CUDA_TRY(cudaMemcpyAsync(d_ttimes.data(),
ttimes.data(),
ttimes.size() * sizeof(int64_t),
cudaMemcpyDefault,
stream.value()));
rmm::device_uvector<int32_t> d_offsets{offsets.size(), stream};
CUDA_TRY(cudaMemcpyAsync(d_offsets.data(),
offsets.data(),
offsets.size() * sizeof(int32_t),
cudaMemcpyDefault,
stream.value()));
auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset);
rmm::device_uvector<int64_t> d_ttimes = cudf::detail::make_device_uvector_async(ttimes, stream);
rmm::device_uvector<int32_t> d_offsets = cudf::detail::make_device_uvector_async(offsets, stream);
auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset);
stream.synchronize();

return {gmt_offset, std::move(d_ttimes), std::move(d_offsets)};
Expand Down
18 changes: 5 additions & 13 deletions cpp/src/io/parquet/page_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
Expand Down Expand Up @@ -1716,19 +1717,10 @@ dremel_data get_dremel_data(column_view h_col,
},
stream);

thrust::host_vector<size_type> column_offsets(d_column_offsets.size());
CUDA_TRY(cudaMemcpyAsync(column_offsets.data(),
d_column_offsets.data(),
d_column_offsets.size() * sizeof(size_type),
cudaMemcpyDeviceToHost,
stream.value()));
thrust::host_vector<size_type> column_ends(d_column_ends.size());
CUDA_TRY(cudaMemcpyAsync(column_ends.data(),
d_column_ends.data(),
d_column_ends.size() * sizeof(size_type),
cudaMemcpyDeviceToHost,
stream.value()));

thrust::host_vector<size_type> column_offsets =
cudf::detail::make_host_vector_async(d_column_offsets, stream);
thrust::host_vector<size_type> column_ends =
cudf::detail::make_host_vector_async(d_column_ends, stream);
stream.synchronize();

size_t max_vals_size = 0;
Expand Down
7 changes: 1 addition & 6 deletions cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -673,12 +673,7 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node,
_nullability = std::vector<uint8_t>(r_nullability.crbegin(), r_nullability.crend());
// TODO(cp): Explore doing this for all columns in a single go outside this ctor. Maybe using
// hostdevice_vector. Currently this involves a cudaMemcpyAsync for each column.
_d_nullability = rmm::device_uvector<uint8_t>(_nullability.size(), stream);
CUDA_TRY(cudaMemcpyAsync(_d_nullability.data(),
_nullability.data(),
_nullability.size() * sizeof(uint8_t),
cudaMemcpyHostToDevice,
stream.value()));
_d_nullability = cudf::detail::make_device_uvector_async(_nullability, stream);

_is_list = (_max_rep_level > 0);

Expand Down
8 changes: 2 additions & 6 deletions cpp/src/strings/filter_chars.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -126,12 +127,7 @@ std::unique_ptr<column> filter_characters(
characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) {
return char_range{entry.first, entry.second};
});
rmm::device_uvector<char_range> table(table_size, stream);
CUDA_TRY(cudaMemcpyAsync(table.data(),
htable.data(),
table_size * sizeof(char_range),
cudaMemcpyHostToDevice,
stream.value()));
rmm::device_uvector<char_range> table = cudf::detail::make_device_uvector_async(htable, stream);

auto d_strings = column_device_view::create(strings.parent(), stream);

Expand Down
9 changes: 3 additions & 6 deletions cpp/src/strings/replace/backref_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/replace_re.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -116,12 +117,8 @@ std::unique_ptr<column> replace_with_backrefs(

// parse the repl string for back-ref indicators
auto const parse_result = parse_backrefs(repl);
rmm::device_uvector<backref_type> backrefs(parse_result.second.size(), stream);
CUDA_TRY(cudaMemcpyAsync(backrefs.data(),
parse_result.second.data(),
sizeof(backref_type) * backrefs.size(),
cudaMemcpyHostToDevice,
stream.value()));
rmm::device_uvector<backref_type> backrefs =
cudf::detail::make_device_uvector_async(parse_result.second, stream);
string_scalar repl_scalar(parse_result.first, true, stream);
string_view const d_repl_template = repl_scalar.value();

Expand Down
9 changes: 3 additions & 6 deletions cpp/src/strings/translate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -101,12 +102,8 @@ std::unique_ptr<column> translate(
return lhs.first < rhs.first;
});
// copy translate table to device memory
rmm::device_uvector<translate_table> table(htable.size(), stream);
CUDA_TRY(cudaMemcpyAsync(table.data(),
htable.data(),
sizeof(translate_table) * htable.size(),
cudaMemcpyHostToDevice,
stream.value()));
rmm::device_uvector<translate_table> table =
cudf::detail::make_device_uvector_async(htable, stream);

auto d_strings = column_device_view::create(strings.parent(), stream);

Expand Down
8 changes: 2 additions & 6 deletions cpp/src/transform/row_bit_count.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/table/table_device_view.cuh>
Expand Down Expand Up @@ -496,12 +497,7 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
auto d_cols = contiguous_copy_column_device_views<column_device_view>(cols, stream);

// move stack info to the gpu
rmm::device_uvector<column_info> d_info(info.size(), stream);
CUDA_TRY(cudaMemcpyAsync(d_info.data(),
info.data(),
sizeof(column_info) * info.size(),
cudaMemcpyHostToDevice,
stream.value()));
rmm::device_uvector<column_info> d_info = cudf::detail::make_device_uvector_async(info, stream);

// each thread needs to maintain a stack of row spans of size max_branch_depth. we will use
// shared memory to do this rather than allocating a potentially gigantic temporary buffer
Expand Down

0 comments on commit 8ceed73

Please sign in to comment.