Extend device_scalar to optionally use pinned bounce buffer #16947

Merged
merged 33 commits into branch-24.12 from fea-pinned-aware-device_scalar, Oct 18, 2024

Changes from 22 of 33 commits

Commits
e542de1
class with overridden value(); single use
vuule Sep 26, 2024
0216948
Merge branch 'branch-24.12' of https://github.com/rapidsai/cudf into …
vuule Sep 26, 2024
b74a4d0
clean up + ctor
vuule Sep 26, 2024
b955404
stop using to_device
vuule Sep 26, 2024
20633fe
use in cuIO
vuule Sep 26, 2024
fccd97d
rest of src
vuule Sep 26, 2024
ab18eb9
include
vuule Sep 26, 2024
a272fe7
initial impl
vuule Sep 26, 2024
7c1c918
Merge branch 'branch-24.12' of https://github.com/rapidsai/cudf into …
vuule Sep 27, 2024
c0a2e71
rework API
vuule Sep 27, 2024
db97c3d
impl fix
vuule Sep 27, 2024
80047eb
docs
vuule Sep 27, 2024
6cf40b3
throw when mismatched sizes
vuule Sep 27, 2024
7414926
Merge branch 'branch-24.12' into fea-host-device-copy
vuule Sep 27, 2024
4957e27
Merge branch 'fea-host-device-copy' into fea-pinned-aware-device_scalar
vuule Sep 27, 2024
5997049
Merge branch 'branch-24.12' of https://github.com/rapidsai/cudf into …
vuule Sep 27, 2024
4030a89
fix
vuule Sep 27, 2024
243e12e
bounce buffer
vuule Sep 28, 2024
09b329a
Merge branch 'branch-24.12' into fea-pinned-aware-device_scalar
vuule Sep 28, 2024
381a49a
style
vuule Sep 28, 2024
7e9eb33
Merge branch 'fea-pinned-aware-device_scalar' of https://github.com/v…
vuule Sep 28, 2024
ac13130
Merge branch 'branch-24.12' of https://github.com/rapidsai/cudf into …
vuule Sep 30, 2024
a052495
style
vuule Sep 30, 2024
d584fcc
Merge branch 'branch-24.12' of https://github.com/rapidsai/cudf into …
vuule Oct 1, 2024
f6a9266
remove impl namespace
vuule Oct 1, 2024
d14f371
Merge branch 'fea-host-device-copy' into fea-pinned-aware-device_scalar
vuule Oct 1, 2024
2056834
Merge branch 'branch-24.12' of https://github.com/rapidsai/cudf into …
vuule Oct 1, 2024
931265b
fix minmax with move ctor definition
vuule Oct 2, 2024
e287b89
Merge branch 'branch-24.12' into fea-pinned-aware-device_scalar
vuule Oct 2, 2024
2468636
Merge branch 'branch-24.12' into fea-pinned-aware-device_scalar
vyasr Oct 3, 2024
483ce19
Merge branch 'branch-24.12' into fea-pinned-aware-device_scalar
vuule Oct 4, 2024
80dc0e5
code review
vuule Oct 10, 2024
3d63126
Merge branch 'branch-24.12' into fea-pinned-aware-device_scalar
vuule Oct 10, 2024
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/copy_if.cuh
@@ -19,6 +19,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
@@ -36,7 +37,6 @@

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

@@ -256,7 +256,7 @@ struct scatter_gather_functor {

cudf::detail::grid_1d grid{input.size(), block_size, per_thread};

rmm::device_scalar<cudf::size_type> null_count{0, stream};
cudf::detail::device_scalar<cudf::size_type> null_count{0, stream};
if (output.nullable()) {
// Have to initialize the output mask to all zeros because we may update
// it with atomicOr().
5 changes: 2 additions & 3 deletions cpp/include/cudf/detail/copy_if_else.cuh
@@ -19,12 +19,11 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/device_scalar.hpp>

#include <cuda/std/optional>
#include <thrust/iterator/iterator_traits.h>

@@ -171,7 +170,7 @@ std::unique_ptr<column> copy_if_else(bool nullable,

// if we have validity in the output
if (nullable) {
rmm::device_scalar<size_type> valid_count{0, stream};
cudf::detail::device_scalar<size_type> valid_count{0, stream};

// call the kernel
copy_if_else_kernel<block_size, Element, LeftIter, RightIter, FilterFn, true>
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/copy_range.cuh
@@ -18,6 +18,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
@@ -27,7 +28,6 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

#include <cub/cub.cuh>
#include <cuda_runtime.h>
@@ -154,7 +154,7 @@ void copy_range(SourceValueIterator source_value_begin,
auto grid = cudf::detail::grid_1d{num_items, block_size, 1};

if (target.nullable()) {
rmm::device_scalar<size_type> null_count(target.null_count(), stream);
cudf::detail::device_scalar<size_type> null_count(target.null_count(), stream);

auto kernel =
copy_range_kernel<block_size, SourceValueIterator, SourceValidityIterator, T, true>;
92 changes: 92 additions & 0 deletions cpp/include/cudf/detail/device_scalar.hpp
@@ -0,0 +1,92 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/host_vector.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/resource_ref.hpp>

namespace CUDF_EXPORT cudf {
namespace detail {

template <typename T>
class device_scalar : public rmm::device_scalar<T> {
public:
~device_scalar() = default;

device_scalar(device_scalar&&) noexcept = default;
device_scalar& operator=(device_scalar&&) noexcept = default;
Review comment (Contributor):
Curious why the move ctor required code but this did not?

Reply from vuule (Author):
Default implementations should be fine in both cases. Compiled fine on 12.5 🤷
I suspect it's an 11.8 compiler bug, but really didn't want to dig into it, with a handy workaround available.


device_scalar(device_scalar const&) = delete;
device_scalar& operator=(device_scalar const&) = delete;

device_scalar() = delete;

explicit device_scalar(
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
: rmm::device_scalar<T>(stream, mr), bounce_buffer{make_host_vector<T>(1, stream)}
{
}

explicit device_scalar(
T const& initial_value,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
: rmm::device_scalar<T>(stream, mr), bounce_buffer{make_host_vector<T>(1, stream)}
{
bounce_buffer[0] = initial_value;
cuda_memcpy_async<T>(device_span<T>{this->data(), 1}, bounce_buffer, stream);
}

device_scalar(device_scalar const& other,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
: rmm::device_scalar<T>(other, stream, mr), bounce_buffer{make_host_vector<T>(1, stream)}
{
}

[[nodiscard]] T value(rmm::cuda_stream_view stream) const
{
cuda_memcpy<T>(bounce_buffer, device_span<T const>{this->data(), 1}, stream);
return bounce_buffer[0];
}

void set_value_async(T const& value, rmm::cuda_stream_view stream)
{
bounce_buffer[0] = value;
cuda_memcpy_async<T>(device_span<T>{this->data(), 1}, bounce_buffer, stream);
}

void set_value_async(T&& value, rmm::cuda_stream_view stream)
{
bounce_buffer[0] = std::move(value);
Review comment from vuule (Author), Sep 30, 2024:
Bonus feature from having a bounce buffer: we don't need to worry about the value lifetime. rmm::device_scalar prohibits passing an rvalue here, but we don't need to.

cuda_memcpy_async<T>(device_span<T>{this->data(), 1}, bounce_buffer, stream);
}

void set_value_to_zero_async(rmm::cuda_stream_view stream) { set_value_async(T{}, stream); }

private:
mutable cudf::detail::host_vector<T> bounce_buffer;
};

} // namespace detail
} // namespace CUDF_EXPORT cudf
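
For readers skimming the diff, a minimal usage sketch of the new wrapper follows (illustration only, not part of the PR; usage_example and the literal values are made up, and only the members shown in the diff above are assumed):

// Minimal sketch of cudf::detail::device_scalar usage; not part of this diff.
#include <cudf/detail/device_scalar.hpp>

#include <rmm/cuda_stream_view.hpp>

void usage_example(rmm::cuda_stream_view stream)
{
  // The value ctor stages the initial value in the pinned bounce buffer and
  // issues an asynchronous H2D copy instead of a pageable memcpy.
  cudf::detail::device_scalar<int> counter{0, stream};

  // An rvalue is safe here, which rmm::device_scalar prohibits (see the review
  // comment above): the value is moved into the owned bounce buffer before the
  // async copy starts, so its lifetime is not tied to the caller.
  counter.set_value_async(42, stream);

  // value() copies D2H through the bounce buffer, then synchronizes the stream.
  int const result = counter.value(stream);
  static_cast<void>(result);
}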
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/null_mask.cuh
@@ -16,6 +16,7 @@
#pragma once

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/detail/valid_if.cuh>
@@ -25,7 +26,6 @@
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/exec_policy.hpp>

#include <cub/block/block_reduce.cuh>
@@ -165,7 +165,7 @@ size_type inplace_bitmask_binop(Binop op,
"Mask pointer cannot be null");

rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref();
rmm::device_scalar<size_type> d_counter{0, stream, mr};
cudf::detail::device_scalar<size_type> d_counter{0, stream, mr};
rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);

84 changes: 70 additions & 14 deletions cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -17,41 +17,97 @@
#pragma once

#include <cudf/utilities/export.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace CUDF_EXPORT cudf {
namespace detail {

namespace impl {

enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };

void cuda_memcpy_async(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);

} // namespace impl

/**
* @brief Asynchronously copies data between the host and device.
* @brief Asynchronously copies data from host to device memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination memory address
* @param src Source memory address
* @param size Number of bytes to copy
* @param kind Type of host memory
* @param dst Destination device memory
* @param src Source host memory
* @param stream CUDA stream used for the copy
*/
void cuda_memcpy_async(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
template <typename T>
void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
{
CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
auto const is_pinned = src.is_device_accessible();
impl::cuda_memcpy_async(
dst.data(),
src.data(),
src.size_bytes(),
is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE,
stream);
}

/**
* @brief Synchronously copies data between the host and device.
* @brief Asynchronously copies data from device to host memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination memory address
* @param src Source memory address
* @param size Number of bytes to copy
* @param kind Type of host memory
* @param dst Destination host memory
* @param src Source device memory
* @param stream CUDA stream used for the copy
*/
void cuda_memcpy(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
template <typename T>
void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
{
CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
auto const is_pinned = dst.is_device_accessible();
impl::cuda_memcpy_async(
dst.data(),
src.data(),
src.size_bytes(),
is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE,
stream);
}

/**
* @brief Synchronously copies data from host to device memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination device memory
* @param src Source host memory
* @param stream CUDA stream used for the copy
*/
template <typename T>
void cuda_memcpy(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
{
cuda_memcpy_async(dst, src, stream);
stream.synchronize();
}

/**
* @brief Synchronously copies data from device to host memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination host memory
* @param src Source device memory
* @param stream CUDA stream used for the copy
*/
template <typename T>
void cuda_memcpy(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
{
cuda_memcpy_async(dst, src, stream);
stream.synchronize();
}

} // namespace detail
} // namespace CUDF_EXPORT cudf
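
For reference, a short sketch of how the new span-based overloads are meant to be called (illustration only; roundtrip and the buffer size are made up, and the container-to-span conversions mirror the call sites in this PR):

// Sketch of the span-based copy API declared above; not part of the diff.
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <algorithm>

void roundtrip(rmm::cuda_stream_view stream)
{
  // make_host_vector allocates through cudf's host allocator; the span's
  // is_device_accessible() reports whether the memory is pinned.
  auto host_src = cudf::detail::make_host_vector<int>(128, stream);
  std::fill(host_src.begin(), host_src.end(), 7);

  rmm::device_uvector<int> device_buf(host_src.size(), stream);

  // Host -> device: dispatches to the PINNED or PAGEABLE path based on
  // is_device_accessible(), and throws on mismatched span sizes.
  cudf::detail::cuda_memcpy_async<int>(device_buf, host_src, stream);

  // Device -> host, synchronous variant: same dispatch, then stream.synchronize().
  auto host_dst = cudf::detail::make_host_vector<int>(device_buf.size(), stream);
  cudf::detail::cuda_memcpy<int>(host_dst, device_buf, stream);
}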
16 changes: 3 additions & 13 deletions cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -101,12 +101,7 @@ rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
rmm::device_async_resource_ref mr)
{
rmm::device_uvector<T> ret(source_data.size(), stream, mr);
auto const is_pinned = source_data.is_device_accessible();
cuda_memcpy_async(ret.data(),
source_data.data(),
source_data.size() * sizeof(T),
is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
stream);
cuda_memcpy_async<T>(ret, source_data, stream);
return ret;
}

@@ -405,13 +400,8 @@ host_vector<T> make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str
template <typename T>
host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
{
auto result = make_host_vector<T>(v.size(), stream);
auto const is_pinned = result.get_allocator().is_device_accessible();
cuda_memcpy_async(result.data(),
v.data(),
v.size() * sizeof(T),
is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
stream);
auto result = make_host_vector<T>(v.size(), stream);
cuda_memcpy_async<T>(result, v, stream);
return result;
}

4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -16,6 +16,7 @@

#pragma once

#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/types.hpp>
@@ -25,7 +26,6 @@
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

#include <thrust/distance.h>

@@ -101,7 +101,7 @@ std::pair<rmm::device_buffer, size_type> valid_if(InputIterator begin,

size_type null_count{0};
if (size > 0) {
rmm::device_scalar<size_type> valid_count{0, stream};
cudf::detail::device_scalar<size_type> valid_count{0, stream};

constexpr size_type block_size{256};
grid_1d grid{size, block_size};
5 changes: 3 additions & 2 deletions cpp/include/cudf/scalar/scalar.hpp
@@ -16,6 +16,7 @@
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/table/table.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
@@ -94,8 +95,8 @@ class scalar {
[[nodiscard]] bool const* validity_data() const;

protected:
data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar
rmm::device_scalar<bool> _is_valid; ///< Device bool signifying validity
data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar
cudf::detail::device_scalar<bool> _is_valid; ///< Device bool signifying validity

/**
* @brief Move constructor for scalar.