-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Kernel copy for pinned memory #15934
Changes from 74 commits
eb39019
24b1245
6c896f6
0048c59
1964523
f871ca0
ac0ce9c
b610ba3
ab36162
69a1bce
83f665a
659cabc
c1ae478
b1a1582
707dfc7
1c09d0c
c343c31
25ddc4f
3fc988b
50f4d3e
8dfbd07
e429840
e5af490
9082ccc
054a98a
17b1ee0
e3c344b
ea6408f
2dbb68f
cb9cc22
cf67a14
24c1549
9c97833
075deca
164fce2
b566bab
21edb53
3814797
168609d
3ef149d
c933157
6784e07
a49789c
ba06fbd
f7999aa
c9a82d0
930efef
0466949
f312219
7cfee0a
fe4d668
52f4a96
4c2b7cf
e2c8613
5a71f77
9068642
2ec4670
a886eb4
0dae691
dd1fba8
59ed0dd
c6ef5f1
dcaeaba
e75808c
d50f145
0a2742f
68a03f1
b63b393
336c7e0
1741037
fff667b
da2c009
1bbd574
692f775
ce58c46
d897984
49d65b8
84a1797
0b2aa13
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <rmm/cuda_stream_view.hpp> | ||
|
||
namespace cudf::detail { | ||
|
||
enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; | ||
|
||
/** | ||
* @brief Asynchronously copies data between the host and device. | ||
* | ||
* Implementation may use different strategies depending on the size and type of host data. | ||
* | ||
* @param dst Destination memory address | ||
* @param src Source memory address | ||
* @param size Number of bytes to copy | ||
* @param kind Type of host memory | ||
* @param stream CUDA stream used for the copy | ||
*/ | ||
void cuda_memcpy_async( | ||
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); | ||
|
||
/** | ||
* @brief Synchronously copies data between the host and device. | ||
* | ||
* Implementation may use different strategies depending on the size and type of host data. | ||
* | ||
* @param dst Destination memory address | ||
* @param src Source memory address | ||
* @param size Number of bytes to copy | ||
* @param kind Type of host memory | ||
* @param stream CUDA stream used for the copy | ||
*/ | ||
void cuda_memcpy( | ||
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); | ||
|
||
} // namespace cudf::detail |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,4 +55,20 @@ struct pinned_mr_options { | |
*/ | ||
bool config_default_pinned_memory_resource(pinned_mr_options const& opts); | ||
|
||
/** | ||
* @brief Set the threshold size for using kernels for pinned memory copies. | ||
* | ||
* @param threshold The threshold size in bytes. If the size of the copy is less than this | ||
* threshold, the copy will be done using kernels. If the size is greater than or equal to this | ||
* threshold, the copy will be done using cudaMemcpyAsync. | ||
Comment on lines
+61
to
+63
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are there any "magic" sizes where we expect one strategy to outperform the other? (A page size, a multiple of 1 kiB or similar) Or is this purely empirical? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair to say that we don't know what the right value is for this (yet?). It's likely to be empirical, since the only goal is to avoid too many copies going through the copy engine. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let’s do a sweep over threshold values for the next steps where we enable this more broadly. I would like something closer to a microbenchmark (copy back and forth for different sizes with different thresholds?) than the multithreaded Parquet benchmark. |
||
*/ | ||
void set_kernel_pinned_copy_threshold(size_t threshold); | ||
|
||
/** | ||
* @brief Get the threshold size for using kernels for pinned memory copies. | ||
* | ||
* @return The threshold size in bytes. | ||
*/ | ||
size_t get_kernel_pinned_copy_threshold(); | ||
|
||
} // namespace cudf |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently the code is pretty repetitive/pointless. Implementation is meant to leave room for more complex behavior without changes to the API in cuda_memcpy.hpp. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <cudf/detail/utilities/cuda_memcpy.hpp> | ||
#include <cudf/utilities/error.hpp> | ||
#include <cudf/utilities/pinned_memory.hpp> | ||
|
||
#include <rmm/exec_policy.hpp> | ||
|
||
#include <thrust/copy.h> | ||
|
||
namespace cudf::detail { | ||
|
||
namespace { | ||
|
||
void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) | ||
{ | ||
if (size == 0) return; | ||
|
||
if (size < get_kernel_pinned_copy_threshold()) { | ||
thrust::copy_n(rmm::exec_policy_nosync(stream), | ||
static_cast<const char*>(src), | ||
size, | ||
static_cast<char*>(dst)); | ||
} else { | ||
CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); | ||
} | ||
} | ||
|
||
void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) | ||
{ | ||
if (size == 0) return; | ||
|
||
CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); | ||
} | ||
|
||
}; // namespace | ||
|
||
void cuda_memcpy_async( | ||
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) | ||
{ | ||
switch (kind) { | ||
case host_memory_kind::PINNED: copy_pinned(dst, src, size, stream); break; | ||
case host_memory_kind::PAGEABLE: | ||
default: copy_pageable(dst, src, size, stream); break; | ||
} | ||
} | ||
|
||
void cuda_memcpy( | ||
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) | ||
{ | ||
cuda_memcpy_async(dst, src, size, kind, stream); | ||
stream.synchronize(); | ||
} | ||
|
||
} // namespace cudf::detail |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we want another name for this, given that it does not always call
cudaMemcpyAsync
? Proposing:cudf_memcpy_async
.(Happy to go either way on this, the status quo is fine.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't like to include cudf in the name when it's already in the
cudf
namespace. Named it this way to make it obvious that it replaces the use ofcudaMemcpyAsync
. That said, I could probably be convinced to rename it, not tied to any specific name.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm inclined to agree, I don't like duplicating the namespace name in objects already within the namespace. That only encourages bad practices like using declarations to import the namespace members.