From f267b1f068ec3e8fd49599fc28afa2fc0464118b Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Wed, 26 Jun 2024 19:44:54 -0700
Subject: [PATCH] Kernel copy for pinned memory (#15934)

Issue https://github.com/rapidsai/cudf/issues/15620

Added an API that lets users set the size threshold below which pinned
memory copies are performed with a kernel instead of `cudaMemcpyAsync`.
The default threshold is zero, so the default behavior is unchanged.
The API currently only affects `hostdevice_vector` H<->D synchronization.

The PR also adds wrappers for `cudaMemcpyAsync` so that configurable
behavior can later be implemented for pageable copies as well (e.g. copy
to pinned + kernel copy).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15934
---
 cpp/CMakeLists.txt                            |  1 +
 .../cudf/detail/utilities/cuda_memcpy.hpp     | 53 ++++++++++++++
 cpp/include/cudf/utilities/pinned_memory.hpp  | 16 +++++
 cpp/src/io/utilities/hostdevice_vector.hpp    | 13 ++--
 cpp/src/utilities/cuda_memcpy.cu              | 71 +++++++++++++++++++
 cpp/src/utilities/pinned_memory.cpp           | 14 ++++
 6 files changed, 160 insertions(+), 8 deletions(-)
 create mode 100644 cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
 create mode 100644 cpp/src/utilities/cuda_memcpy.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5fd68bfb26c..35cf90411f2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -662,6 +662,7 @@ add_library(
   src/unary/math_ops.cu
   src/unary/nan_ops.cu
   src/unary/null_ops.cu
+  src/utilities/cuda_memcpy.cu
   src/utilities/default_stream.cpp
   src/utilities/linked_column.cpp
   src/utilities/logger.cpp
diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
new file mode 100644
index 00000000000..b66c461ab12
--- /dev/null
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::detail {
+
+enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
+
+/**
+ * @brief Asynchronously copies data between the host and device.
+ *
+ * The implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param size Number of bytes to copy
+ * @param kind Type of host memory
+ * @param stream CUDA stream used for the copy
+ */
+void cuda_memcpy_async(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
+/**
+ * @brief Synchronously copies data between the host and device.
+ *
+ * The implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param size Number of bytes to copy
+ * @param kind Type of host memory
+ * @param stream CUDA stream used for the copy
+ */
+void cuda_memcpy(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp
index b423eab6d38..3e2fa43cb50 100644
--- a/cpp/include/cudf/utilities/pinned_memory.hpp
+++ b/cpp/include/cudf/utilities/pinned_memory.hpp
@@ -55,4 +55,20 @@ struct pinned_mr_options {
  */
 bool config_default_pinned_memory_resource(pinned_mr_options const& opts);
 
+/**
+ * @brief Set the threshold size for using kernels for pinned memory copies.
+ *
+ * @param threshold The threshold size in bytes. If the size of the copy is less than this
+ * threshold, the copy will be done using kernels. If the size is greater than or equal to this
+ * threshold, the copy will be done using cudaMemcpyAsync.
+ */
+void set_kernel_pinned_copy_threshold(size_t threshold);
+
+/**
+ * @brief Get the threshold size for using kernels for pinned memory copies.
+ *
+ * @return The threshold size in bytes.
+ */
+size_t get_kernel_pinned_copy_threshold();
+
 }  // namespace cudf
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 9acd6a1e3a9..aed745c42dd 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -18,6 +18,7 @@
 
 #include "hostdevice_span.hpp"
 
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include
 #include
 #include
@@ -124,26 +125,22 @@ class hostdevice_vector {
 
   void host_to_device_async(rmm::cuda_stream_view stream)
   {
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
+    cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void host_to_device_sync(rmm::cuda_stream_view stream)
   {
-    host_to_device_async(stream);
-    stream.synchronize();
+    cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void device_to_host_async(rmm::cuda_stream_view stream)
   {
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
+    cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void device_to_host_sync(rmm::cuda_stream_view stream)
   {
-    device_to_host_async(stream);
-    stream.synchronize();
+    cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   /**
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu
new file mode 100644
index 00000000000..3d0822d8545
--- /dev/null
+++ b/cpp/src/utilities/cuda_memcpy.cu
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/copy.h>
+
+namespace cudf::detail {
+
+namespace {
+
+void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
+{
+  if (size == 0) return;
+
+  if (size < get_kernel_pinned_copy_threshold()) {
+    thrust::copy_n(rmm::exec_policy_nosync(stream),
+                   static_cast<char const*>(src),
+                   size,
+                   static_cast<char*>(dst));
+  } else {
+    CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
+  }
+}
+
+void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
+{
+  if (size == 0) return;
+
+  CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
+}
+
+}  // namespace
+
+void cuda_memcpy_async(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
+{
+  if (kind == host_memory_kind::PINNED) {
+    copy_pinned(dst, src, size, stream);
+  } else if (kind == host_memory_kind::PAGEABLE) {
+    copy_pageable(dst, src, size, stream);
+  } else {
+    CUDF_FAIL("Unsupported host memory kind");
+  }
+}
+
+void cuda_memcpy(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, size, kind, stream);
+  stream.synchronize();
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp
index e90b7969b4d..3ea4293fc60 100644
--- a/cpp/src/utilities/pinned_memory.cpp
+++ b/cpp/src/utilities/pinned_memory.cpp
@@ -211,4 +211,18 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts)
   return did_configure;
 }
 
+CUDF_EXPORT auto& kernel_pinned_copy_threshold()
+{
+  // With the default threshold of zero, cudaMemcpyAsync is used for all pinned copies
+  static std::atomic<size_t> threshold = 0;
+  return threshold;
+}
+
+void set_kernel_pinned_copy_threshold(size_t threshold)
+{
+  kernel_pinned_copy_threshold() = threshold;
+}
+
+size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); }
+
 }  // namespace cudf
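
A minimal caller-side sketch of the new API (illustrative only, not part of the patch): `set_kernel_pinned_copy_threshold` is public, while `cuda_memcpy_async`/`cuda_memcpy` and `host_memory_kind` are internal `cudf::detail` symbols; the `pinned_copy_example` function, the chosen sizes, and the raw `cudaMallocHost`/`cudaMalloc` allocations are assumptions made for the sketch.

```cpp
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/utilities/pinned_memory.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <cuda_runtime.h>

#include <cstddef>

// Hypothetical example; error checking of the CUDA runtime calls is omitted.
void pinned_copy_example(rmm::cuda_stream_view stream)
{
  // Route pinned copies smaller than 64 KiB through the kernel (thrust::copy_n) path.
  cudf::set_kernel_pinned_copy_threshold(64 * 1024);

  constexpr std::size_t size = 32 * 1024;  // below the threshold -> kernel copy
  void* pinned = nullptr;
  void* device = nullptr;
  cudaMallocHost(&pinned, size);  // pinned (page-locked) host allocation
  cudaMalloc(&device, size);

  // Asynchronous H->D copy; dispatches to the kernel path because size < threshold
  cudf::detail::cuda_memcpy_async(
    device, pinned, size, cudf::detail::host_memory_kind::PINNED, stream);

  // Synchronous D->H copy; same dispatch, followed by stream.synchronize()
  cudf::detail::cuda_memcpy(
    pinned, device, size, cudf::detail::host_memory_kind::PINNED, stream);

  cudaFree(device);
  cudaFreeHost(pinned);
}
```

With the default threshold of zero, the `size < threshold` test never passes, so both calls fall back to `cudaMemcpyAsync` and behavior matches the pre-existing code paths.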