Modify make_host_vector and make_device_uvector factories to optionally use pinned memory and kernel copy (#16206)

Issue #15616

Modified `make_host_vector` functions to return `cudf::detail::host_vector`, which can use a pinned or a pageable memory resource. When pinned memory is used, the D2H copy is potentially done using a CUDA kernel.
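
To illustrate the new call pattern, here is a minimal sketch (not taken from the diff; `d_data` and `stream` are hypothetical):

```cpp
#include <cudf/detail/utilities/vector_factories.hpp>

void example(cudf::device_span<int const> d_data, rmm::cuda_stream_view stream)
{
  // Now returns cudf::detail::host_vector<int> instead of thrust::host_vector<int>.
  auto h_data = cudf::detail::make_host_vector_sync(d_data, stream);
  // When the allocator is pinned, the D2H copy above may have used the kernel path.
  bool const used_pinned = h_data.get_allocator().is_device_accessible();
}
```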

Also added factories to create `host_vector`s without device data. These are useful to replace uses of `std::vector` and `thrust::host_vector` when the data eventually gets copied to the GPU.
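
A sketch of the intended replacement pattern (hypothetical values; `compute_offset` is a stand-in; mirrors the `trie.hpp` change below):

```cpp
// Reserve capacity up front, fill on the host, then copy to the GPU.
auto h_offsets = cudf::detail::make_empty_host_vector<cudf::size_type>(num_rows, stream);
for (cudf::size_type i = 0; i < num_rows; ++i) {
  h_offsets.push_back(compute_offset(i));  // compute_offset is a stand-in
}
// The copy below can use the kernel path when h_offsets is pinned.
auto d_offsets = cudf::detail::make_device_uvector_async(
  h_offsets, stream, rmm::mr::get_current_device_resource());
```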

Added `is_device_accessible` to `host_span`. With this, `make_device_uvector` can optionally use the kernel for the H2D copy.
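
A hedged sketch of the new query (assumes the `host_span` converting constructor for `host_vector` added in this PR):

```cpp
auto h_vec = cudf::detail::make_host_vector<int>(1024, stream);
cudf::host_span<int const> s{h_vec};  // the span records device accessibility
if (s.is_device_accessible()) {
  // make_device_uvector_* can launch the copy kernel instead of cudaMemcpyAsync
}
```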

Modified `cudf::detail::host_vector` to be derived from `thrust::host_vector`, to avoid issues with implicit conversion from `std::vector`.
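
A minimal illustration of the hazard this guards against (my reconstruction, not from the PR):

```cpp
void upload(cudf::detail::host_vector<int> const& h);  // expects the rmm allocator

std::vector<int> v{1, 2, 3};
// When host_vector was an alias for thrust::host_vector<int, rmm_host_allocator<int>>,
// thrust's converting constructor could let this compile with a default-constructed
// allocator. With host_vector as a derived class, the implicit conversion is gone:
// upload(v);  // no longer compiles; callers must allocate via the factories
```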

Used `cudf::detail::host_vector` and its new factory functions wherever data ends up copied to the GPU.

Stopped using `thrust::copy_n` for the kernel copy path in `cuda_memcpy` because an optimization in Thrust allows it to fall back to `cudaMemcpyAsync`. We now call a simple local kernel instead.
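
The replacement is a plain element-wise kernel; a hedged sketch of its shape (the actual kernel lives in `cuda_memcpy.cu` and may differ):

```cpp
__global__ void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n)
{
  auto const idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  // A straight byte copy; unlike thrust::copy_n, this can never be
  // silently replaced by a cudaMemcpyAsync under the hood.
  if (idx < n) { dst[idx] = src[idx]; }
}
```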

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)
  - Alessandro Bellina (https://github.com/abellina)

URL: #16206
vuule authored Jul 24, 2024
1 parent 39f256c commit f0efc8b
Showing 40 changed files with 539 additions and 192 deletions.
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
@@ -671,9 +671,9 @@ add_library(
   src/unary/null_ops.cu
   src/utilities/cuda_memcpy.cu
   src/utilities/default_stream.cpp
+  src/utilities/host_memory.cpp
   src/utilities/linked_column.cpp
   src/utilities/logger.cpp
-  src/utilities/pinned_memory.cpp
   src/utilities/prefetch.cpp
   src/utilities/stacktrace.cpp
   src/utilities/stream_pool.cpp
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/gather.cuh
@@ -577,7 +577,7 @@ void gather_bitmask(table_view const& source,
   }
 
   // Make device array of target bitmask pointers
-  std::vector<bitmask_type*> target_masks(target.size());
+  auto target_masks = make_host_vector<bitmask_type*>(target.size(), stream);
   std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) {
     return col->mutable_view().null_mask();
   });
4 changes: 3 additions & 1 deletion cpp/include/cudf/detail/null_mask.cuh
@@ -430,7 +430,9 @@ std::vector<size_type> segmented_count_bits(bitmask_type const* bitmask,
   if (num_segments == 0) { return std::vector<size_type>{}; }
 
   // Construct a contiguous host buffer of indices and copy to device.
-  auto const h_indices = std::vector<size_type>(indices_begin, indices_end);
+  auto h_indices = make_empty_host_vector<typename std::iterator_traits<IndexIterator>::value_type>(
+    std::distance(indices_begin, indices_end), stream);
+  std::copy(indices_begin, indices_end, std::back_inserter(h_indices));
   auto const d_indices =
     make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource());
 
51 changes: 51 additions & 0 deletions cpp/include/cudf/detail/utilities/host_memory.hpp
@@ -0,0 +1,51 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/detail/utilities/host_vector.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/pinned_memory.hpp>

#include <rmm/resource_ref.hpp>

#include <cstddef>

namespace cudf::detail {
/**
* @brief Get the memory resource to be used for pageable memory allocations.
*
* @return Reference to the pageable memory resource
*/
CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource();

/**
* @brief Get the allocator to be used for the host memory allocation.
*
* @param size The number of elements of type T to allocate
* @param stream The stream to use for the allocation
* @return The allocator to be used for the host memory allocation
*/
template <typename T>
rmm_host_allocator<T> get_host_allocator(std::size_t size, rmm::cuda_stream_view stream)
{
if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) {
return {get_pinned_memory_resource(), stream};
}
return {get_pageable_memory_resource(), stream};
}

} // namespace cudf::detail
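
For context, a sketch of how this allocator selection is consumed (the threshold is configuration-dependent; `num_elements` is hypothetical):

```cpp
// Requests at or below the pinned threshold get a device-accessible (pinned)
// allocator; larger requests fall back to pageable host memory.
auto alloc = cudf::detail::get_host_allocator<int>(num_elements, stream);
auto vec   = cudf::detail::host_vector<int>(num_elements, alloc);
if (vec.get_allocator().is_device_accessible()) {
  // copies involving this buffer may take the kernel path
}
```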
24 changes: 21 additions & 3 deletions cpp/include/cudf/detail/utilities/host_vector.hpp
@@ -61,6 +61,10 @@ class rmm_host_allocator<void> {
 };
 };
 
+template <class DesiredProperty, class... Properties>
+inline constexpr bool contains_property =
+  (cuda::std::is_same_v<DesiredProperty, Properties> || ... || false);
+
 /*! \p rmm_host_allocator is a CUDA-specific host memory allocator
  * that employs \c `rmm::host_async_resource_ref` for allocation.
  *
@@ -100,8 +104,12 @@ class rmm_host_allocator {
   /**
    * @brief Construct from a `cudf::host_async_resource_ref`
    */
-  rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream)
-    : mr(_mr), stream(_stream)
+  template <class... Properties>
+  rmm_host_allocator(cuda::mr::async_resource_ref<cuda::mr::host_accessible, Properties...> _mr,
+                     rmm::cuda_stream_view _stream)
+    : mr(_mr),
+      stream(_stream),
+      _is_device_accessible{contains_property<cuda::mr::device_accessible, Properties...>}
   {
   }
 
@@ -173,15 +181,25 @@ class rmm_host_allocator {
    */
   inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); }
 
+  bool is_device_accessible() const { return _is_device_accessible; }
+
  private:
   rmm::host_async_resource_ref mr;
   rmm::cuda_stream_view stream;
+  bool _is_device_accessible;
 };
 
 /**
  * @brief A vector class with rmm host memory allocator
  */
 template <typename T>
-using host_vector = thrust::host_vector<T, rmm_host_allocator<T>>;
+class host_vector : public thrust::host_vector<T, rmm_host_allocator<T>> {
+ public:
+  using base = thrust::host_vector<T, rmm_host_allocator<T>>;
+
+  host_vector(rmm_host_allocator<T> const& alloc) : base(alloc) {}
+
+  host_vector(size_t size, rmm_host_allocator<T> const& alloc) : base(size, alloc) {}
+};
 
 } // namespace cudf::detail
106 changes: 72 additions & 34 deletions cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -21,6 +21,8 @@
  * @file vector_factories.hpp
  */
 
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/detail/utilities/host_memory.hpp>
 #include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -32,8 +34,6 @@
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <thrust/host_vector.h>
-
 #include <vector>
 
 namespace cudf {
@@ -100,11 +100,12 @@ rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
                                                  rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
-                                source_data.data(),
-                                source_data.size() * sizeof(T),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  auto const is_pinned = source_data.is_device_accessible();
+  cuda_memcpy_async(ret.data(),
+                    source_data.data(),
+                    source_data.size() * sizeof(T),
+                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                    stream);
   return ret;
 }
 
@@ -271,21 +272,11 @@ rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
   return make_device_uvector_sync(device_span<typename Container::value_type const>{c}, stream, mr);
 }
 
-// Utility function template to allow copying to either a thrust::host_vector or std::vector
-template <typename T, typename OutContainer>
-OutContainer make_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
-{
-  OutContainer result(v.size());
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value()));
-  return result;
-}
-
 /**
  * @brief Asynchronously construct a `std::vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does not synchronize `stream`.
+ * @note This function does not synchronize `stream` after the copy.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
@@ -295,14 +286,17 @@ OutContainer make_vector_async(device_span<T const> v, rmm::cuda_stream_view str
 template <typename T>
 std::vector<T> make_std_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
-  return make_vector_async<T, std::vector<T>>(v, stream);
+  std::vector<T> result(v.size());
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
+    result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value()));
+  return result;
 }
 
 /**
  * @brief Asynchronously construct a `std::vector` containing a copy of data from a device
  * container
  *
- * @note This function synchronizes `stream`.
+ * @note This function synchronizes `stream` after the copy.
  *
  * @tparam Container The type of the container to copy from
  * @tparam T The type of the data to copy
@@ -324,7 +318,7 @@ std::vector<typename Container::value_type> make_std_vector_async(Container cons
  * @brief Synchronously construct a `std::vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does a synchronize on `stream`.
+ * @note This function does a synchronize on `stream` after the copy.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
@@ -361,28 +355,71 @@ std::vector<typename Container::value_type> make_std_vector_sync(Container const
   return make_std_vector_sync(device_span<typename Container::value_type const>{c}, stream);
 }
 
+/**
+ * @brief Construct a `cudf::detail::host_vector` of the given size.
+ *
+ * @note The returned vector may be using a pinned memory resource.
+ *
+ * @tparam T The type of the vector data
+ * @param size The number of elements in the created vector
+ * @param stream The stream on which to allocate memory
+ * @return A host_vector of the given size
+ */
+template <typename T>
+host_vector<T> make_host_vector(size_t size, rmm::cuda_stream_view stream)
+{
+  return host_vector<T>(size, get_host_allocator<T>(size, stream));
+}
+
+/**
+ * @brief Construct an empty `cudf::detail::host_vector` with the given capacity.
+ *
+ * @note The returned vector may be using a pinned memory resource.
+ *
+ * @tparam T The type of the vector data
+ * @param capacity Initial capacity of the vector
+ * @param stream The stream on which to allocate memory
+ * @return A host_vector with the given capacity
+ */
+template <typename T>
+host_vector<T> make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream)
+{
+  auto result = host_vector<T>(get_host_allocator<T>(capacity, stream));
+  result.reserve(capacity);
+  return result;
+}
+
 /**
  * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does not synchronize `stream`.
+ * @note This function does not synchronize `stream` after the copy. The returned vector may be
+ * using a pinned memory resource.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
  * @param stream The stream on which to perform the copy
  * @return The data copied to the host
  */
 template <typename T>
-thrust::host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
+host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
-  return make_vector_async<T, thrust::host_vector<T>>(v, stream);
+  auto result = make_host_vector<T>(v.size(), stream);
+  auto const is_pinned = result.get_allocator().is_device_accessible();
+  cuda_memcpy_async(result.data(),
+                    v.data(),
+                    v.size() * sizeof(T),
+                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                    stream);
+  return result;
 }
 
 /**
  * @brief Asynchronously construct a `std::vector` containing a copy of data from a device
  * container
  *
- * @note This function does not synchronize `stream`.
+ * @note This function does not synchronize `stream` after the copy. The returned vector may be
+ * using a pinned memory resource.
  *
  * @tparam Container The type of the container to copy from
  * @tparam T The type of the data to copy
@@ -394,8 +431,8 @@ template <
   typename Container,
   std::enable_if_t<
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
-thrust::host_vector<typename Container::value_type> make_host_vector_async(
-  Container const& c, rmm::cuda_stream_view stream)
+host_vector<typename Container::value_type> make_host_vector_async(Container const& c,
+                                                                   rmm::cuda_stream_view stream)
 {
   return make_host_vector_async(device_span<typename Container::value_type const>{c}, stream);
 }
@@ -404,15 +441,16 @@ thrust::host_vector<typename Container::value_type> make_host_vector_async(
  * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does a synchronize on `stream`.
+ * @note This function does a synchronize on `stream` after the copy. The returned vector may be
+ * using a pinned memory resource.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
  * @param stream The stream on which to perform the copy
  * @return The data copied to the host
  */
 template <typename T>
-thrust::host_vector<T> make_host_vector_sync(device_span<T const> v, rmm::cuda_stream_view stream)
+host_vector<T> make_host_vector_sync(device_span<T const> v, rmm::cuda_stream_view stream)
 {
   auto result = make_host_vector_async(v, stream);
   stream.synchronize();
@@ -423,7 +461,7 @@ thrust::host_vector<T> make_host_vector_sync(device_span<T const> v, rmm::cuda_s
  * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device
  * container
  *
- * @note This function synchronizes `stream`.
+ * @note This function synchronizes `stream` after the copy.
  *
  * @tparam Container The type of the container to copy from
  * @tparam T The type of the data to copy
@@ -435,16 +473,16 @@ template <
   typename Container,
   std::enable_if_t<
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
-thrust::host_vector<typename Container::value_type> make_host_vector_sync(
-  Container const& c, rmm::cuda_stream_view stream)
+host_vector<typename Container::value_type> make_host_vector_sync(Container const& c,
+                                                                  rmm::cuda_stream_view stream)
 {
   return make_host_vector_sync(device_span<typename Container::value_type const>{c}, stream);
 }
 
 /**
  * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size
  *
- * @note This function may not synchronize `stream`.
+ * @note This function may not synchronize `stream` after the copy.
  *
  * @tparam T The type of the vector data
  * @param size The number of elements in the created vector
@@ -460,7 +498,7 @@ host_vector<T> make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea
 /**
  * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size
  *
- * @note This function synchronizes `stream`.
+ * @note This function synchronizes `stream` after the copy.
  *
  * @tparam T The type of the vector data
  * @param size The number of elements in the created vector
4 changes: 2 additions & 2 deletions cpp/include/cudf/io/text/detail/trie.hpp
@@ -223,11 +223,11 @@ struct trie {
 
     match_length.emplace_back(0);
 
-    std::vector<trie_node> trie_nodes;
     auto token_counts = std::unordered_map<cudf::size_type, int32_t>();
+    auto trie_nodes = cudf::detail::make_empty_host_vector<trie_node>(tokens.size(), stream);
 
     for (uint32_t i = 0; i < tokens.size(); i++) {
-      trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]});
+      trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]});
       token_counts[tokens[i]]++;
     }
 
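
One detail worth noting in this hunk: the switch from `emplace_back` to `push_back` appears to be needed because `thrust::host_vector`, unlike `std::vector`, does not provide `emplace_back`; the element is constructed first and then copied in:

```cpp
// std::vector (before): construct the trie_node in place
// trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]});
// cudf::detail::host_vector (after): construct, then push; capacity was reserved
// up front by make_empty_host_vector, so no reallocation occurs
trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]});
```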
(diffs for the remaining 33 changed files not loaded)