From f267b1f068ec3e8fd49599fc28afa2fc0464118b Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Wed, 26 Jun 2024 19:44:54 -0700
Subject: [PATCH] Kernel copy for pinned memory (#15934)

Issue https://github.com/rapidsai/cudf/issues/15620

Added an API that lets users set the size threshold below which pinned
memory copies are performed with a kernel instead of `cudaMemcpyAsync`.
The default threshold is zero, so the default behavior is unchanged.
The API currently only affects `hostdevice_vector` H<->D synchronization.

The PR also adds wrappers for `cudaMemcpyAsync` so that configurable
behavior can later be implemented for pageable copies as well (e.g. copy
to pinned + kernel copy).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15934
---
 cpp/CMakeLists.txt                            |  1 +
 .../cudf/detail/utilities/cuda_memcpy.hpp     | 53 ++++++++++++++
 cpp/include/cudf/utilities/pinned_memory.hpp  | 16 +++++
 cpp/src/io/utilities/hostdevice_vector.hpp    | 13 ++--
 cpp/src/utilities/cuda_memcpy.cu              | 71 +++++++++++++++++++
 cpp/src/utilities/pinned_memory.cpp           | 14 ++++
 6 files changed, 160 insertions(+), 8 deletions(-)
 create mode 100644 cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
 create mode 100644 cpp/src/utilities/cuda_memcpy.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5fd68bfb26c..35cf90411f2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -662,6 +662,7 @@ add_library(
   src/unary/math_ops.cu
   src/unary/nan_ops.cu
   src/unary/null_ops.cu
+  src/utilities/cuda_memcpy.cu
   src/utilities/default_stream.cpp
   src/utilities/linked_column.cpp
   src/utilities/logger.cpp
diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
new file mode 100644
index 00000000000..b66c461ab12
--- /dev/null
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::detail {
+
+enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
+
+/**
+ * @brief Asynchronously copies data between the host and device.
+ *
+ * The implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param size Number of bytes to copy
+ * @param kind Type of host memory
+ * @param stream CUDA stream used for the copy
+ */
+void cuda_memcpy_async(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
+/**
+ * @brief Synchronously copies data between the host and device.
+ *
+ * The implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param size Number of bytes to copy
+ * @param kind Type of host memory
+ * @param stream CUDA stream used for the copy
+ */
+void cuda_memcpy(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp
index b423eab6d38..3e2fa43cb50 100644
--- a/cpp/include/cudf/utilities/pinned_memory.hpp
+++ b/cpp/include/cudf/utilities/pinned_memory.hpp
@@ -55,4 +55,20 @@ struct pinned_mr_options {
  */
 bool config_default_pinned_memory_resource(pinned_mr_options const& opts);
 
+/**
+ * @brief Set the threshold size for using kernels for pinned memory copies.
+ *
+ * @param threshold The threshold size in bytes. If the size of the copy is less than this
+ * threshold, the copy will be done using kernels. If the size is greater than or equal to this
+ * threshold, the copy will be done using cudaMemcpyAsync.
+ */
+void set_kernel_pinned_copy_threshold(size_t threshold);
+
+/**
+ * @brief Get the threshold size for using kernels for pinned memory copies.
+ *
+ * @return The threshold size in bytes.
+ */
+size_t get_kernel_pinned_copy_threshold();
+
 }  // namespace cudf
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 9acd6a1e3a9..aed745c42dd 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -18,6 +18,7 @@
 
 #include "hostdevice_span.hpp"
 
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include
 #include
 #include
@@ -124,26 +125,22 @@ class hostdevice_vector {
 
   void host_to_device_async(rmm::cuda_stream_view stream)
   {
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
+    cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void host_to_device_sync(rmm::cuda_stream_view stream)
   {
-    host_to_device_async(stream);
-    stream.synchronize();
+    cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void device_to_host_async(rmm::cuda_stream_view stream)
   {
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
+    cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void device_to_host_sync(rmm::cuda_stream_view stream)
   {
-    device_to_host_async(stream);
-    stream.synchronize();
+    cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   /**
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu
new file mode 100644
index 00000000000..3d0822d8545
--- /dev/null
+++ b/cpp/src/utilities/cuda_memcpy.cu
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/copy.h>
+
+namespace cudf::detail {
+
+namespace {
+
+void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
+{
+  if (size == 0) return;
+
+  if (size < get_kernel_pinned_copy_threshold()) {
+    thrust::copy_n(rmm::exec_policy_nosync(stream),
+                   static_cast<char const*>(src),
+                   size,
+                   static_cast<char*>(dst));
+  } else {
+    CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
+  }
+}
+
+void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
+{
+  if (size == 0) return;
+
+  CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
+}
+
+}  // namespace
+
+void cuda_memcpy_async(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
+{
+  if (kind == host_memory_kind::PINNED) {
+    copy_pinned(dst, src, size, stream);
+  } else if (kind == host_memory_kind::PAGEABLE) {
+    copy_pageable(dst, src, size, stream);
+  } else {
+    CUDF_FAIL("Unsupported host memory kind");
+  }
+}
+
+void cuda_memcpy(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, size, kind, stream);
+  stream.synchronize();
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp
index e90b7969b4d..3ea4293fc60 100644
--- a/cpp/src/utilities/pinned_memory.cpp
+++ b/cpp/src/utilities/pinned_memory.cpp
@@ -211,4 +211,18 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts)
   return did_configure;
 }
 
+CUDF_EXPORT auto& kernel_pinned_copy_threshold()
+{
+  // With the default threshold of zero, cudaMemcpyAsync is used for all pinned copies
+  static std::atomic<size_t> threshold = 0;
+  return threshold;
+}
+
+void set_kernel_pinned_copy_threshold(size_t threshold)
+{
+  kernel_pinned_copy_threshold() = threshold;
+}
+
+size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); }
+
 }  // namespace cudf
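
A minimal caller-side sketch of the new API (illustrative only, not part of the patch): `set_kernel_pinned_copy_threshold` is public, while `cuda_memcpy_async`/`cuda_memcpy` and `host_memory_kind` are internal `cudf::detail` symbols; the `pinned_copy_example` function, the chosen sizes, and the raw `cudaMallocHost`/`cudaMalloc` allocations are assumptions made for the sketch.

```cpp
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/utilities/pinned_memory.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <cuda_runtime.h>

#include <cstddef>

// Hypothetical example; error checking of the CUDA runtime calls is omitted.
void pinned_copy_example(rmm::cuda_stream_view stream)
{
  // Route pinned copies smaller than 64 KiB through the kernel (thrust::copy_n) path.
  cudf::set_kernel_pinned_copy_threshold(64 * 1024);

  constexpr std::size_t size = 32 * 1024;  // below the threshold -> kernel copy
  void* pinned = nullptr;
  void* device = nullptr;
  cudaMallocHost(&pinned, size);  // pinned (page-locked) host allocation
  cudaMalloc(&device, size);

  // Asynchronous H->D copy; dispatches to the kernel path because size < threshold
  cudf::detail::cuda_memcpy_async(
    device, pinned, size, cudf::detail::host_memory_kind::PINNED, stream);

  // Synchronous D->H copy; same dispatch, followed by stream.synchronize()
  cudf::detail::cuda_memcpy(
    pinned, device, size, cudf::detail::host_memory_kind::PINNED, stream);

  cudaFree(device);
  cudaFreeHost(pinned);
}
```

With the default threshold of zero, the `size < threshold` test never passes, so both calls fall back to `cudaMemcpyAsync` and behavior matches the pre-existing code paths.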