Skip to content

Commit

Permalink
Kernel copy for pinned memory (#15934)
Browse files Browse the repository at this point in the history
Issue #15620

Added an API that enables users to set the threshold under which we perform pinned memory copies using a kernel. The default threshold is zero, so there's no change in default behavior.
The API currently only impacts `hostdevice_vector` H<->D synchronization.

The PR adds wrappers for `cudaMemcpyAsync` so we can implement configurable behavior for pageable copies as well (e.g. copy to pinned + kernel copy).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: #15934
  • Loading branch information
vuule authored Jun 27, 2024
1 parent 6eac920 commit f267b1f
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 8 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,7 @@ add_library(
src/unary/math_ops.cu
src/unary/nan_ops.cu
src/unary/null_ops.cu
src/utilities/cuda_memcpy.cu
src/utilities/default_stream.cpp
src/utilities/linked_column.cpp
src/utilities/logger.cpp
Expand Down
53 changes: 53 additions & 0 deletions cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <rmm/cuda_stream_view.hpp>

namespace cudf::detail {

enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };

/**
* @brief Asynchronously copies data between the host and device.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination memory address
* @param src Source memory address
* @param size Number of bytes to copy
* @param kind Type of host memory
* @param stream CUDA stream used for the copy
*/
void cuda_memcpy_async(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);

/**
* @brief Synchronously copies data between the host and device.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination memory address
* @param src Source memory address
* @param size Number of bytes to copy
* @param kind Type of host memory
* @param stream CUDA stream used for the copy
*/
void cuda_memcpy(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);

} // namespace cudf::detail
16 changes: 16 additions & 0 deletions cpp/include/cudf/utilities/pinned_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,20 @@ struct pinned_mr_options {
*/
bool config_default_pinned_memory_resource(pinned_mr_options const& opts);

/**
* @brief Set the threshold size for using kernels for pinned memory copies.
*
* @param threshold The threshold size in bytes. If the size of the copy is less than this
* threshold, the copy will be done using kernels. If the size is greater than or equal to this
* threshold, the copy will be done using cudaMemcpyAsync.
*/
void set_kernel_pinned_copy_threshold(size_t threshold);

/**
* @brief Get the threshold size for using kernels for pinned memory copies.
*
* @return The threshold size in bytes.
*/
size_t get_kernel_pinned_copy_threshold();

} // namespace cudf
13 changes: 5 additions & 8 deletions cpp/src/io/utilities/hostdevice_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "hostdevice_span.hpp"

#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/host_vector.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -124,26 +125,22 @@ class hostdevice_vector {

void host_to_device_async(rmm::cuda_stream_view stream)
{
CUDF_CUDA_TRY(
cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}

void host_to_device_sync(rmm::cuda_stream_view stream)
{
host_to_device_async(stream);
stream.synchronize();
cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}

void device_to_host_async(rmm::cuda_stream_view stream)
{
CUDF_CUDA_TRY(
cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}

void device_to_host_sync(rmm::cuda_stream_view stream)
{
device_to_host_async(stream);
stream.synchronize();
cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}

/**
Expand Down
71 changes: 71 additions & 0 deletions cpp/src/utilities/cuda_memcpy.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/pinned_memory.hpp>

#include <rmm/exec_policy.hpp>

#include <thrust/copy.h>

namespace cudf::detail {

namespace {

void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
{
if (size == 0) return;

if (size < get_kernel_pinned_copy_threshold()) {
thrust::copy_n(rmm::exec_policy_nosync(stream),
static_cast<const char*>(src),
size,
static_cast<char*>(dst));
} else {
CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
}
}

void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
{
if (size == 0) return;

CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
}

}; // namespace

void cuda_memcpy_async(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
{
if (kind == host_memory_kind::PINNED) {
copy_pinned(dst, src, size, stream);
} else if (kind == host_memory_kind::PAGEABLE) {
copy_pageable(dst, src, size, stream);
} else {
CUDF_FAIL("Unsupported host memory kind");
}
}

void cuda_memcpy(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
{
cuda_memcpy_async(dst, src, size, kind, stream);
stream.synchronize();
}

} // namespace cudf::detail
14 changes: 14 additions & 0 deletions cpp/src/utilities/pinned_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,4 +211,18 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts)
return did_configure;
}

CUDF_EXPORT auto& kernel_pinned_copy_threshold()
{
// use cudaMemcpyAsync for all pinned copies
static std::atomic<size_t> threshold = 0;
return threshold;
}

void set_kernel_pinned_copy_threshold(size_t threshold)
{
kernel_pinned_copy_threshold() = threshold;
}

size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); }

} // namespace cudf

0 comments on commit f267b1f

Please sign in to comment.