Modify make_host_vector and make_device_uvector factories to optionally use pinned memory and kernel copy (#16206)

Issue #15616

Modified `make_host_vector` functions to return `cudf::detail::host_vector`, which can use a pinned or a pageable memory resource. When pinned memory is used, the D2H copy is potentially done using a CUDA kernel.
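
To illustrate the new call pattern, here is a minimal sketch (not taken from the diff; `d_data` and `stream` are hypothetical):

```cpp
#include <cudf/detail/utilities/vector_factories.hpp>

void example(cudf::device_span<int const> d_data, rmm::cuda_stream_view stream)
{
  // Now returns cudf::detail::host_vector<int> instead of thrust::host_vector<int>.
  auto h_data = cudf::detail::make_host_vector_sync(d_data, stream);
  // When the allocator is pinned, the D2H copy above may have used the kernel path.
  bool const used_pinned = h_data.get_allocator().is_device_accessible();
}
```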

Also added factories to create `host_vector`s without device data. These are useful to replace uses of `std::vector` and `thrust::host_vector` when the data eventually gets copied to the GPU.
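
A sketch of the intended replacement pattern (hypothetical values; `compute_offset` is a stand-in; mirrors the `trie.hpp` change below):

```cpp
// Reserve capacity up front, fill on the host, then copy to the GPU.
auto h_offsets = cudf::detail::make_empty_host_vector<cudf::size_type>(num_rows, stream);
for (cudf::size_type i = 0; i < num_rows; ++i) {
  h_offsets.push_back(compute_offset(i));  // compute_offset is a stand-in
}
// The copy below can use the kernel path when h_offsets is pinned.
auto d_offsets = cudf::detail::make_device_uvector_async(
  h_offsets, stream, rmm::mr::get_current_device_resource());
```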

Added `is_device_accessible` to `host_span`. With this, `make_device_uvector` can optionally use the kernel for the H2D copy.
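
A hedged sketch of the new query (assumes the `host_span` converting constructor for `host_vector` added in this PR):

```cpp
auto h_vec = cudf::detail::make_host_vector<int>(1024, stream);
cudf::host_span<int const> s{h_vec};  // the span records device accessibility
if (s.is_device_accessible()) {
  // make_device_uvector_* can launch the copy kernel instead of cudaMemcpyAsync
}
```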

Modified `cudf::detail::host_vector` to be derived from `thrust::host_vector`, to avoid issues with implicit conversion from `std::vector`.
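
A minimal illustration of the hazard this guards against (my reconstruction, not from the PR):

```cpp
void upload(cudf::detail::host_vector<int> const& h);  // expects the rmm allocator

std::vector<int> v{1, 2, 3};
// When host_vector was an alias for thrust::host_vector<int, rmm_host_allocator<int>>,
// thrust's converting constructor could let this compile with a default-constructed
// allocator. With host_vector as a derived class, the implicit conversion is gone:
// upload(v);  // no longer compiles; callers must allocate via the factories
```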

Used `cudf::detail::host_vector` and its new factory functions wherever data ends up copied to the GPU.

Stopped using `thrust::copy_n` for the kernel copy path in `cuda_memcpy` because an optimization in Thrust allows it to fall back to `cudaMemcpyAsync`. We now call a simple local kernel instead.
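
The replacement is a plain element-wise kernel; a hedged sketch of its shape (the actual kernel lives in `cuda_memcpy.cu` and may differ):

```cpp
__global__ void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n)
{
  auto const idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  // A straight byte copy; unlike thrust::copy_n, this can never be
  // silently replaced by a cudaMemcpyAsync under the hood.
  if (idx < n) { dst[idx] = src[idx]; }
}
```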

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)
  - Alessandro Bellina (https://github.com/abellina)

URL: #16206
vuule authored Jul 24, 2024
1 parent 39f256c commit f0efc8b
Showing 40 changed files with 539 additions and 192 deletions.
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
@@ -671,9 +671,9 @@ add_library(
   src/unary/null_ops.cu
   src/utilities/cuda_memcpy.cu
   src/utilities/default_stream.cpp
+  src/utilities/host_memory.cpp
   src/utilities/linked_column.cpp
   src/utilities/logger.cpp
-  src/utilities/pinned_memory.cpp
   src/utilities/prefetch.cpp
   src/utilities/stacktrace.cpp
   src/utilities/stream_pool.cpp
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/gather.cuh
@@ -577,7 +577,7 @@ void gather_bitmask(table_view const& source,
   }
 
   // Make device array of target bitmask pointers
-  std::vector<bitmask_type*> target_masks(target.size());
+  auto target_masks = make_host_vector<bitmask_type*>(target.size(), stream);
   std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) {
     return col->mutable_view().null_mask();
   });
4 changes: 3 additions & 1 deletion cpp/include/cudf/detail/null_mask.cuh
@@ -430,7 +430,9 @@ std::vector<size_type> segmented_count_bits(bitmask_type const* bitmask,
   if (num_segments == 0) { return std::vector<size_type>{}; }
 
   // Construct a contiguous host buffer of indices and copy to device.
-  auto const h_indices = std::vector<size_type>(indices_begin, indices_end);
+  auto h_indices = make_empty_host_vector<typename std::iterator_traits<IndexIterator>::value_type>(
+    std::distance(indices_begin, indices_end), stream);
+  std::copy(indices_begin, indices_end, std::back_inserter(h_indices));
   auto const d_indices =
     make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource());
 
51 changes: 51 additions & 0 deletions cpp/include/cudf/detail/utilities/host_memory.hpp
@@ -0,0 +1,51 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/detail/utilities/host_vector.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/pinned_memory.hpp>

#include <rmm/resource_ref.hpp>

#include <cstddef>

namespace cudf::detail {
/**
* @brief Get the memory resource to be used for pageable memory allocations.
*
* @return Reference to the pageable memory resource
*/
CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource();

/**
* @brief Get the allocator to be used for the host memory allocation.
*
* @param size The number of elements of type T to allocate
* @param stream The stream to use for the allocation
* @return The allocator to be used for the host memory allocation
*/
template <typename T>
rmm_host_allocator<T> get_host_allocator(std::size_t size, rmm::cuda_stream_view stream)
{
if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) {
return {get_pinned_memory_resource(), stream};
}
return {get_pageable_memory_resource(), stream};
}

} // namespace cudf::detail
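
For context, a sketch of how this allocator selection is consumed (the threshold is configuration-dependent; `num_elements` is hypothetical):

```cpp
// Requests at or below the pinned threshold get a device-accessible (pinned)
// allocator; larger requests fall back to pageable host memory.
auto alloc = cudf::detail::get_host_allocator<int>(num_elements, stream);
auto vec   = cudf::detail::host_vector<int>(num_elements, alloc);
if (vec.get_allocator().is_device_accessible()) {
  // copies involving this buffer may take the kernel path
}
```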
24 changes: 21 additions & 3 deletions cpp/include/cudf/detail/utilities/host_vector.hpp
@@ -61,6 +61,10 @@ class rmm_host_allocator<void> {
 };
 };
 
+template <class DesiredProperty, class... Properties>
+inline constexpr bool contains_property =
+  (cuda::std::is_same_v<DesiredProperty, Properties> || ... || false);
+
 /*! \p rmm_host_allocator is a CUDA-specific host memory allocator
  * that employs \c `rmm::host_async_resource_ref` for allocation.
  *
@@ -100,8 +104,12 @@ class rmm_host_allocator {
   /**
    * @brief Construct from a `cudf::host_async_resource_ref`
    */
-  rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream)
-    : mr(_mr), stream(_stream)
+  template <class... Properties>
+  rmm_host_allocator(cuda::mr::async_resource_ref<cuda::mr::host_accessible, Properties...> _mr,
+                     rmm::cuda_stream_view _stream)
+    : mr(_mr),
+      stream(_stream),
+      _is_device_accessible{contains_property<cuda::mr::device_accessible, Properties...>}
   {
   }
 
@@ -173,15 +181,25 @@ class rmm_host_allocator {
    */
   inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); }
 
+  bool is_device_accessible() const { return _is_device_accessible; }
+
  private:
   rmm::host_async_resource_ref mr;
   rmm::cuda_stream_view stream;
+  bool _is_device_accessible;
 };
 
 /**
  * @brief A vector class with rmm host memory allocator
  */
 template <typename T>
-using host_vector = thrust::host_vector<T, rmm_host_allocator<T>>;
+class host_vector : public thrust::host_vector<T, rmm_host_allocator<T>> {
+ public:
+  using base = thrust::host_vector<T, rmm_host_allocator<T>>;
+
+  host_vector(rmm_host_allocator<T> const& alloc) : base(alloc) {}
+
+  host_vector(size_t size, rmm_host_allocator<T> const& alloc) : base(size, alloc) {}
+};
 
 } // namespace cudf::detail
106 changes: 72 additions & 34 deletions cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -21,6 +21,8 @@
  * @file vector_factories.hpp
  */
 
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/detail/utilities/host_memory.hpp>
 #include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -32,8 +34,6 @@
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <thrust/host_vector.h>
-
 #include <vector>
 
 namespace cudf {
@@ -100,11 +100,12 @@ rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
                                                  rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
-                                source_data.data(),
-                                source_data.size() * sizeof(T),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  auto const is_pinned = source_data.is_device_accessible();
+  cuda_memcpy_async(ret.data(),
+                    source_data.data(),
+                    source_data.size() * sizeof(T),
+                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                    stream);
   return ret;
 }
 
@@ -271,21 +272,11 @@ rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
   return make_device_uvector_sync(device_span<typename Container::value_type const>{c}, stream, mr);
 }
 
-// Utility function template to allow copying to either a thrust::host_vector or std::vector
-template <typename T, typename OutContainer>
-OutContainer make_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
-{
-  OutContainer result(v.size());
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value()));
-  return result;
-}
-
 /**
  * @brief Asynchronously construct a `std::vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does not synchronize `stream`.
+ * @note This function does not synchronize `stream` after the copy.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
@@ -295,14 +286,17 @@ OutContainer make_vector_async(device_span<T const> v, rmm::cuda_stream_view str
 template <typename T>
 std::vector<T> make_std_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
-  return make_vector_async<T, std::vector<T>>(v, stream);
+  std::vector<T> result(v.size());
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
+    result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value()));
+  return result;
 }
 
 /**
  * @brief Asynchronously construct a `std::vector` containing a copy of data from a device
  * container
  *
- * @note This function synchronizes `stream`.
+ * @note This function synchronizes `stream` after the copy.
  *
  * @tparam Container The type of the container to copy from
  * @tparam T The type of the data to copy
@@ -324,7 +318,7 @@ std::vector<typename Container::value_type> make_std_vector_async(Container cons
  * @brief Synchronously construct a `std::vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does a synchronize on `stream`.
+ * @note This function does a synchronize on `stream` after the copy.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
@@ -361,28 +355,71 @@ std::vector<typename Container::value_type> make_std_vector_sync(Container const
   return make_std_vector_sync(device_span<typename Container::value_type const>{c}, stream);
 }
 
+/**
+ * @brief Construct a `cudf::detail::host_vector` of the given size.
+ *
+ * @note The returned vector may be using a pinned memory resource.
+ *
+ * @tparam T The type of the vector data
+ * @param size The number of elements in the created vector
+ * @param stream The stream on which to allocate memory
+ * @return A host_vector of the given size
+ */
+template <typename T>
+host_vector<T> make_host_vector(size_t size, rmm::cuda_stream_view stream)
+{
+  return host_vector<T>(size, get_host_allocator<T>(size, stream));
+}
+
+/**
+ * @brief Construct an empty `cudf::detail::host_vector` with the given capacity.
+ *
+ * @note The returned vector may be using a pinned memory resource.
+ *
+ * @tparam T The type of the vector data
+ * @param capacity Initial capacity of the vector
+ * @param stream The stream on which to allocate memory
+ * @return A host_vector with the given capacity
+ */
+template <typename T>
+host_vector<T> make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream)
+{
+  auto result = host_vector<T>(get_host_allocator<T>(capacity, stream));
+  result.reserve(capacity);
+  return result;
+}
+
 /**
  * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does not synchronize `stream`.
+ * @note This function does not synchronize `stream` after the copy. The returned vector may be
+ * using a pinned memory resource.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
  * @param stream The stream on which to perform the copy
  * @return The data copied to the host
  */
 template <typename T>
-thrust::host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
+host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
-  return make_vector_async<T, thrust::host_vector<T>>(v, stream);
+  auto result = make_host_vector<T>(v.size(), stream);
+  auto const is_pinned = result.get_allocator().is_device_accessible();
+  cuda_memcpy_async(result.data(),
+                    v.data(),
+                    v.size() * sizeof(T),
+                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                    stream);
+  return result;
 }
 
 /**
  * @brief Asynchronously construct a `std::vector` containing a copy of data from a device
  * container
  *
- * @note This function does not synchronize `stream`.
+ * @note This function does not synchronize `stream` after the copy. The returned vector may be
+ * using a pinned memory resource.
  *
  * @tparam Container The type of the container to copy from
  * @tparam T The type of the data to copy
@@ -394,8 +431,8 @@ template <
   typename Container,
   std::enable_if_t<
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
-thrust::host_vector<typename Container::value_type> make_host_vector_async(
-  Container const& c, rmm::cuda_stream_view stream)
+host_vector<typename Container::value_type> make_host_vector_async(Container const& c,
+                                                                   rmm::cuda_stream_view stream)
 {
   return make_host_vector_async(device_span<typename Container::value_type const>{c}, stream);
 }
@@ -404,15 +441,16 @@ thrust::host_vector<typename Container::value_type> make_host_vector_async(
  * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does a synchronize on `stream`.
+ * @note This function does a synchronize on `stream` after the copy. The returned vector may be
+ * using a pinned memory resource.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
  * @param stream The stream on which to perform the copy
  * @return The data copied to the host
  */
 template <typename T>
-thrust::host_vector<T> make_host_vector_sync(device_span<T const> v, rmm::cuda_stream_view stream)
+host_vector<T> make_host_vector_sync(device_span<T const> v, rmm::cuda_stream_view stream)
 {
   auto result = make_host_vector_async(v, stream);
   stream.synchronize();
@@ -423,7 +461,7 @@ thrust::host_vector<T> make_host_vector_sync(device_span<T const> v, rmm::cuda_s
  * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device
  * container
  *
- * @note This function synchronizes `stream`.
+ * @note This function synchronizes `stream` after the copy.
  *
  * @tparam Container The type of the container to copy from
  * @tparam T The type of the data to copy
@@ -435,16 +473,16 @@ template <
   typename Container,
   std::enable_if_t<
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
-thrust::host_vector<typename Container::value_type> make_host_vector_sync(
-  Container const& c, rmm::cuda_stream_view stream)
+host_vector<typename Container::value_type> make_host_vector_sync(Container const& c,
+                                                                  rmm::cuda_stream_view stream)
 {
   return make_host_vector_sync(device_span<typename Container::value_type const>{c}, stream);
 }
 
 /**
  * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size
  *
- * @note This function may not synchronize `stream`.
+ * @note This function may not synchronize `stream` after the copy.
  *
  * @tparam T The type of the vector data
  * @param size The number of elements in the created vector
@@ -460,7 +498,7 @@ host_vector<T> make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea
 /**
  * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size
  *
- * @note This function synchronizes `stream`.
+ * @note This function synchronizes `stream` after the copy.
  *
  * @tparam T The type of the vector data
  * @param size The number of elements in the created vector
4 changes: 2 additions & 2 deletions cpp/include/cudf/io/text/detail/trie.hpp
@@ -223,11 +223,11 @@ struct trie {
 
     match_length.emplace_back(0);
 
-    std::vector<trie_node> trie_nodes;
     auto token_counts = std::unordered_map<cudf::size_type, int32_t>();
+    auto trie_nodes = cudf::detail::make_empty_host_vector<trie_node>(tokens.size(), stream);
 
     for (uint32_t i = 0; i < tokens.size(); i++) {
-      trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]});
+      trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]});
       token_counts[tokens[i]]++;
     }
 
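
One detail worth noting in this hunk: the switch from `emplace_back` to `push_back` appears to be needed because `thrust::host_vector`, unlike `std::vector`, does not provide `emplace_back`; the element is constructed first and then copied in:

```cpp
// std::vector (before): construct the trie_node in place
// trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]});
// cudf::detail::host_vector (after): construct, then push; capacity was reserved
// up front by make_empty_host_vector, so no reallocation occurs
trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]});
```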
(diffs for the remaining 33 changed files not loaded)