From 0fadc71411ce9249d173371d64efcef10271d1bb Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 10 Mar 2021 15:05:11 +1100 Subject: [PATCH 1/5] Make it invalid to pass a literal value to `rmm::device_uvector::set_value_async` --- include/rmm/device_uvector.hpp | 26 +++++++++++++++++++------- tests/device_uvector_tests.cpp | 5 +++-- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 1b22cb587..295b4a5fa 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -167,8 +167,8 @@ class device_uvector { * Because this function synchronizes the stream `s`, it is safe to destroy or modify the object * referenced by `v` after this function has returned. * - * @note: This function incurs a host to device memcpy and should be used sparingly. - * @note: This function synchronizes `stream`. + * @note This function incurs a host to device memcpy and should be used sparingly. + * @note This function synchronizes `stream`. * * Example: * \code{cpp} @@ -206,7 +206,11 @@ class device_uvector { * referenced by `v` should not be destroyed or modified until `stream` has been synchronized. * Otherwise, behavior is undefined. * - * @note: This function incurs a host to device memcpy and should be used sparingly. + * @note This function incurs a host to device memcpy and should be used sparingly. + * + * @note Calling this function with a literal or other r-value reference for `v` is disallowed + * to prevent the implementation from asynchronously copying from a literal or other implicit + * temporary after it is deleted or goes out of scope. * * Example: * \code{cpp} @@ -236,11 +240,17 @@ class device_uvector { cudaMemcpyAsync(element_ptr(element_index), &v, sizeof(v), cudaMemcpyDefault, s.value())); } + // We delete the r-value reference overload to prevent asynchronously copying from a literal or + // implicit temporary value after it is deleted or goes out of scope. + void set_element_async(std::size_t element_index, + value_type const&& v, + cuda_stream_view s) = delete; + /** * @brief Returns the specified element from device memory * - * @note: This function incurs a device to host memcpy and should be used sparingly. - * @note: This function synchronizes `stream`. + * @note This function incurs a device to host memcpy and should be used sparingly. + * @note This function synchronizes `stream`. * * @throws rmm::out_of_range exception if `element_index >= size()` * @@ -262,7 +272,8 @@ class device_uvector { /** * @brief Returns the first element. * - * @note: This function incurs a device to host memcpy and should be used sparingly. + * @note This function incurs a device-to-host memcpy and should be used sparingly. + * @note This function synchronizes `stream`. * * @throws rmm::out_of_range exception if the vector is empty. * @@ -274,7 +285,8 @@ class device_uvector { /** * @brief Returns the last element. * - * @note: This function incurs a device to host memcpy and should be used sparingly. + * @note This function incurs a device-to-host memcpy and should be used sparingly. + * @note This function synchronizes `stream`. * * @throws rmm::out_of_range exception if the vector is empty. * diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 9fcffbd43..f3563b50f 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -174,8 +174,9 @@ TYPED_TEST(TypedUVectorTest, GetSetElementAsync) auto size = 12345; rmm::device_uvector uv(size, this->stream()); for (std::size_t i = 0; i < uv.size(); ++i) { - uv.set_element_async(i, i, this->stream()); - EXPECT_EQ(static_cast(i), uv.element(i, this->stream())); + auto init = static_cast(i); + uv.set_element_async(i, init, this->stream()); + EXPECT_EQ(init, uv.element(i, this->stream())); } } From a37ba8f87edd0423ec298dfae39170f60b305067 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 10 Mar 2021 15:05:29 +1100 Subject: [PATCH 2/5] Make it invalid to pass a literal value to `rmm::device_scalar::set_value` and add `set_value_zero` --- include/rmm/device_scalar.hpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp index 5615121c7..fca575bce 100644 --- a/include/rmm/device_scalar.hpp +++ b/include/rmm/device_scalar.hpp @@ -52,7 +52,7 @@ class device_scalar { * @param mr Optional, resource with which to allocate. */ explicit device_scalar( - cuda_stream_view const &stream, + cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) : buffer{sizeof(T), stream, mr} { @@ -76,7 +76,7 @@ class device_scalar { */ explicit device_scalar( T const &initial_value, - cuda_stream_view const &stream = cuda_stream_view{}, + cuda_stream_view stream = cuda_stream_view{}, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) : buffer{sizeof(T), stream, mr} { @@ -96,7 +96,7 @@ class device_scalar { * @param mr The resource to use for allocating the new `device_scalar` */ device_scalar(device_scalar const &other, - cuda_stream_view const &stream = {}, + cuda_stream_view stream = {}, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) : buffer{other.buffer, stream, mr} { @@ -118,7 +118,7 @@ class device_scalar { * @return T The value of the scalar. * @param stream CUDA stream on which to perform the copy and synchronize. */ - T value(cuda_stream_view const &stream = cuda_stream_view{}) const + T value(cuda_stream_view stream = cuda_stream_view{}) const { T host_value{}; _memcpy(&host_value, buffer.data(), stream); @@ -161,7 +161,7 @@ class device_scalar { * @param stream CUDA stream on which to perform the copy */ template - auto set_value(T const &host_value, cuda_stream_view const &stream = cuda_stream_view{}) + auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{}) -> std::enable_if_t::value, Placeholder> { if (host_value == T{0}) { @@ -171,6 +171,15 @@ class device_scalar { } } + void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete; + + template + auto set_value_zero(cuda_stream_view stream = cuda_stream_view{}) + -> std::enable_if_t::value, Placeholder> + { + RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(T), stream.value())); + } + /** * @brief Sets the value of the `device_scalar` to the given `host_value`. * @@ -206,7 +215,7 @@ class device_scalar { * @param stream CUDA stream on which to perform the copy */ template - auto set_value(T const &host_value, cuda_stream_view const &stream = cuda_stream_view{}) + auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{}) -> std::enable_if_t::value, Placeholder> { _memcpy(buffer.data(), &host_value, stream); @@ -241,7 +250,7 @@ class device_scalar { private: rmm::device_buffer buffer{sizeof(T)}; - inline void _memcpy(void *dst, const void *src, cuda_stream_view const &stream) const + inline void _memcpy(void *dst, const void *src, cuda_stream_view stream) const { RMM_CUDA_TRY(cudaMemcpyAsync(dst, src, sizeof(T), cudaMemcpyDefault, stream.value())); } From 378a1140e0c46e331d779fe56a621a532adddaab Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 11 Mar 2021 10:07:20 +1100 Subject: [PATCH 3/5] Add optimized set_value for bools, and `set_value_zero` --- include/rmm/device_scalar.hpp | 107 ++++++++++++++++++++++++++++------ 1 file changed, 88 insertions(+), 19 deletions(-) diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp index fca575bce..b19cc859b 100644 --- a/include/rmm/device_scalar.hpp +++ b/include/rmm/device_scalar.hpp @@ -129,6 +129,9 @@ class device_scalar { /** * @brief Sets the value of the `device_scalar` to the given `host_value`. * + * This specialization for fundamental types is optimized to use `cudaMemsetAsync` when + * `host_value` is zero. + * * @note If the stream specified to this function is different from the stream specified * to the constructor, then appropriate dependencies must be inserted between the streams * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling @@ -138,8 +141,9 @@ class device_scalar { * referenced by `host_value` should not be destroyed or modified until `stream` has been * synchronized. Otherwise, behavior is undefined. * - * @note: This function incurs a host to device memcpy and should be used sparingly. - + * @note: This function incurs a host to device memcpy or device memset and should be used + * sparingly. + * * Example: * \code{cpp} * rmm::device_scalar s; @@ -155,34 +159,70 @@ class device_scalar { * \endcode * * @throws `rmm::cuda_error` if copying `host_value` to device memory fails. - * @throws `rmm::cuda_error` if synchronizing `stream` fails. * * @param host_value The host value which will be copied to device * @param stream CUDA stream on which to perform the copy */ - template - auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{}) - -> std::enable_if_t::value, Placeholder> + template + auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{}) + -> std::enable_if_t::value && not std::is_same::value, void> { - if (host_value == T{0}) { - RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(T), stream.value())); + if (host_value == U{0}) { + set_value_zero(stream); } else { _memcpy(buffer.data(), &host_value, stream); } } - void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete; - - template - auto set_value_zero(cuda_stream_view stream = cuda_stream_view{}) - -> std::enable_if_t::value, Placeholder> + /** + * @brief Sets the value of the `device_scalar` to the given `host_value`. + * + * This specialization for `bool` is optimized to always use `cudaMemsetAsync`. + * + * @note If the stream specified to this function is different from the stream specified + * to the constructor, then appropriate dependencies must be inserted between the streams + * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling + * this function, otherwise there may be a race condition. + * + * This function does not synchronize `stream` before returning. Therefore, the object + * referenced by `host_value` should not be destroyed or modified until `stream` has been + * synchronized. Otherwise, behavior is undefined. + * + * @note: This function incurs a host to device memcpy or device memset and should be used + * sparingly. + * + * Example: + * \code{cpp} + * rmm::device_scalar s; + * + * bool v{true}; + * + * // Copies 42 to device storage on `stream`. Does _not_ synchronize + * vec.set_value(v, stream); + * ... + * cudaStreamSynchronize(stream); + * // Synchronization is required before `v` can be modified + * v = false; + * \endcode + * + * @throws `rmm::cuda_error` if the device memset fails. + * + * @param host_value The host value which the scalar will be set to (true or false) + * @param stream CUDA stream on which to perform the device memset + */ + template + auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{}) + -> std::enable_if_t::value, void> { - RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(T), stream.value())); + RMM_CUDA_TRY(cudaMemsetAsync( + buffer.data(), host_value == true ? true : false, sizeof(bool), stream.value())); } /** * @brief Sets the value of the `device_scalar` to the given `host_value`. * + * Specialization for non-fundamental types. + * * @note If the stream specified to this function is different from the stream specified * to the constructor, then appropriate dependencies must be inserted between the streams * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling @@ -196,16 +236,16 @@ class device_scalar { * Example: * \code{cpp} - * rmm::device_scalar s; + * rmm::device_scalar s; * - * int v{42}; + * my_type v{42, "text"}; * * // Copies 42 to device storage on `stream`. Does _not_ synchronize * vec.set_value(v, stream); * ... * cudaStreamSynchronize(stream); * // Synchronization is required before `v` can be modified - * v = 13; + * v.value = 21; * \endcode * * @throws `rmm::cuda_error` if copying `host_value` to device memory fails @@ -214,13 +254,42 @@ class device_scalar { * @param host_value The host value which will be copied to device * @param stream CUDA stream on which to perform the copy */ - template + template auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{}) - -> std::enable_if_t::value, Placeholder> + -> std::enable_if_t::value, void> { _memcpy(buffer.data(), &host_value, stream); } + // Disallow passing literals to set_value to avoid race conditions where the memory holding the + // literal can be freed before the async memcpy / memset executes. + void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete; + + /** + * @brief Sets the value of the `device_scalar` to zero. + * + * Only supported for fundamental types. + * + * @note If the stream specified to this function is different from the stream specified + * to the constructor, then appropriate dependencies must be inserted between the streams + * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling + * this function, otherwise there may be a race condition. + * + * This function does not synchronize `stream` before returning. + * + * @note: This function incurs a device memset and should be used sparingly. + * + * @throws `rmm::cuda_error` if the device memset fails. + * + * @param stream CUDA stream on which to perform the device memset + */ + template + auto set_value_zero(cuda_stream_view stream = cuda_stream_view{}) + -> std::enable_if_t::value, void> + { + RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(U), stream.value())); + } + /** * @brief Returns pointer to object in device memory. * From fded78ff6a013a2eac8b18aa80dc587575cb5ef0 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 16 Mar 2021 11:59:33 +1100 Subject: [PATCH 4/5] Simplify boolean logic and fix doc error. --- include/rmm/device_scalar.hpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp index b19cc859b..c54fc7937 100644 --- a/include/rmm/device_scalar.hpp +++ b/include/rmm/device_scalar.hpp @@ -184,12 +184,10 @@ class device_scalar { * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling * this function, otherwise there may be a race condition. * - * This function does not synchronize `stream` before returning. Therefore, the object - * referenced by `host_value` should not be destroyed or modified until `stream` has been - * synchronized. Otherwise, behavior is undefined. + * This function does not synchronize `stream` before returning. `host_value` is passed by value + * so a host-side copy may be performed before calling a device memset. * - * @note: This function incurs a host to device memcpy or device memset and should be used - * sparingly. + * @note: This function incurs a device memset. * * Example: * \code{cpp} @@ -197,7 +195,7 @@ class device_scalar { * * bool v{true}; * - * // Copies 42 to device storage on `stream`. Does _not_ synchronize + * // Copies `true` to device storage on `stream`. Does _not_ synchronize * vec.set_value(v, stream); * ... * cudaStreamSynchronize(stream); @@ -214,8 +212,7 @@ class device_scalar { auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{}) -> std::enable_if_t::value, void> { - RMM_CUDA_TRY(cudaMemsetAsync( - buffer.data(), host_value == true ? true : false, sizeof(bool), stream.value())); + RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), host_value, sizeof(bool), stream.value())); } /** From dd21418e033e913a41654dee02ff5a385d5f2c5d Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 16 Mar 2021 11:59:47 +1100 Subject: [PATCH 5/5] Expand device_scalar tests to bool and floating point types --- tests/device_scalar_tests.cpp | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/tests/device_scalar_tests.cpp b/tests/device_scalar_tests.cpp index b9a6def97..8eaa6bbc6 100644 --- a/tests/device_scalar_tests.cpp +++ b/tests/device_scalar_tests.cpp @@ -25,20 +25,43 @@ #include #include #include +#include template struct DeviceScalarTest : public ::testing::Test { + T value{}; rmm::cuda_stream stream{}; rmm::mr::device_memory_resource* mr{rmm::mr::get_current_device_resource()}; - T value{}; std::default_random_engine generator{}; - std::uniform_int_distribution distribution{std::numeric_limits::lowest(), - std::numeric_limits::max()}; - DeviceScalarTest() { value = distribution(generator); } + DeviceScalarTest() { value = random_value(); } + + template ::value, bool> = true> + U random_value() + { + static std::bernoulli_distribution distribution{}; + return distribution(generator); + } + + template < + typename U = T, + std::enable_if_t<(std::is_integral::value && not std::is_same::value), bool> = true> + U random_value() + { + static std::uniform_int_distribution distribution{std::numeric_limits::lowest(), + std::numeric_limits::max()}; + return distribution(generator); + } + + template ::value, bool> = true> + U random_value() + { + static std::normal_distribution distribution{100, 20}; + return distribution(generator); + } }; -using Types = ::testing::Types; +using Types = ::testing::Types; TYPED_TEST_CASE(DeviceScalarTest, Types); @@ -88,7 +111,7 @@ TYPED_TEST(DeviceScalarTest, SetValue) rmm::device_scalar scalar{this->value, this->stream, this->mr}; EXPECT_NE(nullptr, scalar.data()); - auto expected = this->distribution(this->generator); + auto expected = this->random_value(); scalar.set_value(expected, this->stream); EXPECT_EQ(expected, scalar.value(this->stream));