Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kernel copy for pinned memory #15934

Merged
merged 79 commits into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from 74 commits
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
eb39019
remove pinned_host_vector
vuule May 30, 2024
24b1245
switch to host_device resource ref
vuule May 30, 2024
6c896f6
rebrand host memory resource
vuule May 31, 2024
0048c59
style
vuule May 31, 2024
1964523
java update because breaking
vuule May 31, 2024
f871ca0
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule May 31, 2024
ac0ce9c
java fix
vuule May 31, 2024
b610ba3
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule May 31, 2024
ab36162
move test out of io util
vuule May 31, 2024
69a1bce
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 3, 2024
83f665a
missed rename
vuule Jun 3, 2024
659cabc
Merge branch 'branch-24.08' into fea-pinned-vector-factory
vuule Jun 3, 2024
c1ae478
update benchmark changes
vuule Jun 3, 2024
b1a1582
Merge branch 'fea-pinned-vector-factory' of https://github.com/vuule/…
vuule Jun 3, 2024
707dfc7
Merge branch 'branch-24.08' into fea-pinned-vector-factory
vuule Jun 3, 2024
1c09d0c
rename rmm_host_vector
vuule Jun 4, 2024
c343c31
remove do_xyz
vuule Jun 4, 2024
25ddc4f
Merge branch 'fea-pinned-vector-factory' of https://github.com/vuule/…
vuule Jun 4, 2024
3fc988b
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 4, 2024
50f4d3e
comment
vuule Jun 4, 2024
8dfbd07
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 4, 2024
e429840
Merge branch 'fea-pinned-vector-factory' into fea-smart-copy
vuule Jun 4, 2024
e5af490
works
vuule Jun 5, 2024
9082ccc
include style
vuule Jun 5, 2024
054a98a
Merge branch 'branch-24.08' into fea-pinned-vector-factory
vuule Jun 5, 2024
17b1ee0
reviews
vuule Jun 6, 2024
e3c344b
Merge branch 'fea-pinned-vector-factory' of https://github.com/vuule/…
vuule Jun 6, 2024
ea6408f
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 6, 2024
2dbb68f
available_device_memory
vuule Jun 6, 2024
cb9cc22
reviews
vuule Jun 6, 2024
cf67a14
expand anon namespace
vuule Jun 6, 2024
24c1549
host_uvector
vuule Jun 7, 2024
9c97833
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 7, 2024
075deca
style
vuule Jun 7, 2024
164fce2
docs; prefixes
vuule Jun 7, 2024
b566bab
type aliases in host_uvector
vuule Jun 7, 2024
21edb53
refactor host_ticket
vuule Jun 7, 2024
3814797
style
vuule Jun 7, 2024
168609d
Merge branch 'fea-pinned-vector-factory' into fea-smart-copy
vuule Jun 10, 2024
3ef149d
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 10, 2024
c933157
style
vuule Jun 10, 2024
6784e07
more style
vuule Jun 10, 2024
a49789c
Merge branch 'branch-24.08' into fea-smart-copy
vuule Jun 10, 2024
ba06fbd
Merge branch 'fea-pinned-vector-factory' of https://github.com/vuule/…
vuule Jun 10, 2024
f7999aa
Revert "type aliases in host_uvector"
vuule Jun 10, 2024
c9a82d0
Revert "docs; prefixes"
vuule Jun 10, 2024
930efef
Revert "style"
vuule Jun 10, 2024
0466949
Revert "host_uvector"
vuule Jun 10, 2024
f312219
make do without host_uvector
vuule Jun 11, 2024
7cfee0a
missed change
vuule Jun 11, 2024
fe4d668
style
vuule Jun 11, 2024
52f4a96
Merge branch 'fea-pinned-vector-factory' into fea-smart-copy
vuule Jun 11, 2024
4c2b7cf
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 12, 2024
e2c8613
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 12, 2024
5a71f77
rename
vuule Jun 12, 2024
9068642
refactor
vuule Jun 12, 2024
2ec4670
missing newlines
vuule Jun 17, 2024
a886eb4
rename files
vuule Jun 17, 2024
0dae691
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 17, 2024
dd1fba8
Merge branch 'branch-24.08' into fea-smart-copy
vuule Jun 18, 2024
59ed0dd
Merge branch 'branch-24.08' into fea-smart-copy
vuule Jun 18, 2024
c6ef5f1
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 18, 2024
dcaeaba
test commit, please ignore
vuule Jun 18, 2024
e75808c
Merge branch 'fea-smart-copy' of https://github.com/vuule/cudf into f…
vuule Jun 18, 2024
d50f145
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 24, 2024
0a2742f
fix typo
vuule Jun 24, 2024
68a03f1
typeless API
vuule Jun 24, 2024
b63b393
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 24, 2024
336c7e0
Merge branch 'branch-24.08' of https://github.com/rapidsai/cudf into …
vuule Jun 25, 2024
1741037
sorthidth
vuule Jun 25, 2024
fff667b
simplify
vuule Jun 26, 2024
da2c009
Merge branch 'fea-smart-copy' of https://github.com/vuule/cudf into f…
vuule Jun 26, 2024
1bbd574
add missing break
vuule Jun 26, 2024
692f775
Merge branch 'branch-24.08' into fea-smart-copy
vuule Jun 26, 2024
ce58c46
lines
vuule Jun 27, 2024
d897984
Merge branch 'branch-24.08' into fea-smart-copy
vuule Jun 27, 2024
49d65b8
use if/else
vuule Jun 27, 2024
84a1797
Merge branch 'fea-smart-copy' of https://github.com/vuule/cudf into f…
vuule Jun 27, 2024
0b2aa13
Merge branch 'branch-24.08' into fea-smart-copy
vuule Jun 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,7 @@ add_library(
src/unary/math_ops.cu
src/unary/nan_ops.cu
src/unary/null_ops.cu
src/utilities/cuda_memcpy.cu
src/utilities/default_stream.cpp
src/utilities/linked_column.cpp
src/utilities/logger.cpp
Expand Down
53 changes: 53 additions & 0 deletions cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <rmm/cuda_stream_view.hpp>

namespace cudf::detail {

// Kind of host memory participating in a copy, used to pick a copy strategy
// (pinned memory may be copied by a kernel below a configurable size threshold).
enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };

/**
* @brief Asynchronously copies data between the host and device.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination memory address
* @param src Source memory address
* @param size Number of bytes to copy
* @param kind Type of host memory
* @param stream CUDA stream used for the copy
*/
void cuda_memcpy_async(
Copy link
Contributor

@bdice bdice Jun 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want another name for this, given that it does not always call cudaMemcpyAsync? Proposing: cudf_memcpy_async.

(Happy to go either way on this, the status quo is fine.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like to include cudf in the name when it's already in the cudf namespace. Named it this way to make it obvious that it replaces the use of cudaMemcpyAsync. That said, I could probably be convinced to rename it, not tied to any specific name.

Copy link
Contributor

@vyasr vyasr Jun 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm inclined to agree, I don't like duplicating the namespace name in objects already within the namespace. That only encourages bad practices like using declarations to import the namespace members.

void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);

/**
 * @brief Synchronously copies data between the host and device.
 *
 * Implementation may use different strategies depending on the size and type of host data.
 * Blocks until the copy has completed on `stream` before returning.
 *
 * @param dst Destination memory address
 * @param src Source memory address
 * @param size Number of bytes to copy
 * @param kind Type of host memory
 * @param stream CUDA stream used for the copy; synchronized before this function returns
 */
void cuda_memcpy(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);

} // namespace cudf::detail
16 changes: 16 additions & 0 deletions cpp/include/cudf/utilities/pinned_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,20 @@ struct pinned_mr_options {
*/
bool config_default_pinned_memory_resource(pinned_mr_options const& opts);

/**
* @brief Set the threshold size for using kernels for pinned memory copies.
*
* @param threshold The threshold size in bytes. If the size of the copy is less than this
* threshold, the copy will be done using kernels. If the size is greater than or equal to this
* threshold, the copy will be done using cudaMemcpyAsync.
Comment on lines +61 to +63
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any "magic" sizes where we expect one strategy to outperform the other? (A page size, a multiple of 1 kiB or similar) Or is this purely empirical?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair to say that we don't know what the right value is for this (yet?). It's likely to be empirical, since the only goal is to avoid too many copies going through the copy engine.

Copy link
Contributor

@bdice bdice Jun 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let’s do a sweep over threshold values for the next steps where we enable this more broadly. I would like something closer to a microbenchmark (copy back and forth for different sizes with different thresholds?) than the multithreaded Parquet benchmark.

*/
void set_kernel_pinned_copy_threshold(size_t threshold);

/**
 * @brief Get the threshold size for using kernels for pinned memory copies.
 *
 * @return The threshold size in bytes; zero (the default) means pinned copies
 * always use cudaMemcpyAsync
 */
size_t get_kernel_pinned_copy_threshold();

} // namespace cudf
13 changes: 5 additions & 8 deletions cpp/src/io/utilities/hostdevice_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "hostdevice_span.hpp"

#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/host_vector.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -124,26 +125,22 @@ class hostdevice_vector {

// Asynchronously copy the full host buffer to the device buffer.
// Host storage is pinned, so the copy may be performed by a kernel (see cuda_memcpy_async).
void host_to_device_async(rmm::cuda_stream_view stream)
{
  cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}

// Copy the full host buffer to the device buffer and block until the copy completes.
void host_to_device_sync(rmm::cuda_stream_view stream)
{
  cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}

// Asynchronously copy the full device buffer back to the host buffer.
// Host storage is pinned, so the copy may be performed by a kernel (see cuda_memcpy_async).
void device_to_host_async(rmm::cuda_stream_view stream)
{
  cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}

// Copy the full device buffer back to the host buffer and block until the copy completes.
void device_to_host_sync(rmm::cuda_stream_view stream)
{
  cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}

/**
Expand Down
69 changes: 69 additions & 0 deletions cpp/src/utilities/cuda_memcpy.cu
Copy link
Contributor Author

@vuule vuule Jun 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently the code is pretty repetitive/pointless. Implementation is meant to leave room for more complex behavior without changes to the API in cuda_memcpy.hpp.

Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/pinned_memory.hpp>

#include <rmm/exec_policy.hpp>

#include <thrust/copy.h>

namespace cudf::detail {

namespace {

/**
 * @brief Copy to/from pinned host memory.
 *
 * Sizes below the configured threshold are copied with a kernel (thrust::copy_n), which the
 * author notes is meant to keep small copies off the copy engine; sizes at or above the
 * threshold use cudaMemcpyAsync. A zero-size copy is a no-op.
 */
void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
{
  if (size == 0) return;

  if (size < get_kernel_pinned_copy_threshold()) {
    // Byte-wise kernel copy; correct for any trivially-copyable payload
    thrust::copy_n(rmm::exec_policy_nosync(stream),
                   static_cast<char const*>(src),
                   size,
                   static_cast<char*>(dst));
  } else {
    CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
  }
}

// Copy to/from pageable host memory; always uses cudaMemcpyAsync. Zero-size copy is a no-op.
void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
{
  if (size == 0) return;

  CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
}

}  // namespace

/**
 * @brief Dispatch an asynchronous host<->device copy based on the kind of host memory.
 *
 * Pinned host memory may be copied by a kernel (see copy_pinned); pageable memory always
 * goes through cudaMemcpyAsync. An unknown kind is an error rather than silently falling
 * back to the pageable path.
 */
void cuda_memcpy_async(
  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
{
  if (kind == host_memory_kind::PINNED) {
    copy_pinned(dst, src, size, stream);
  } else if (kind == host_memory_kind::PAGEABLE) {
    copy_pageable(dst, src, size, stream);
  } else {
    CUDF_FAIL("Unsupported host memory kind");
  }
}

// Synchronous variant: performs the asynchronous copy, then blocks until `stream`
// has completed it, so the destination is fully populated on return.
void cuda_memcpy(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
{
cuda_memcpy_async(dst, src, size, kind, stream);
stream.synchronize();
}

} // namespace cudf::detail
14 changes: 14 additions & 0 deletions cpp/src/utilities/pinned_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,4 +211,18 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts)
return did_configure;
}

// Process-wide threshold below which pinned copies are performed by a kernel.
// Held in a function-local static (Meyers singleton) so initialization is thread-safe;
// the atomic allows concurrent get/set without external locking.
CUDF_EXPORT auto& kernel_pinned_copy_threshold()
{
  // Zero by default: every pinned copy uses cudaMemcpyAsync.
  static std::atomic<size_t> threshold{0};
  return threshold;
}

void set_kernel_pinned_copy_threshold(size_t threshold)
{
  kernel_pinned_copy_threshold().store(threshold);
}

size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold().load(); }

} // namespace cudf
Loading