diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp index 5615121c7..c54fc7937 100644 --- a/include/rmm/device_scalar.hpp +++ b/include/rmm/device_scalar.hpp @@ -52,7 +52,7 @@ class device_scalar { * @param mr Optional, resource with which to allocate. */ explicit device_scalar( - cuda_stream_view const &stream, + cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) : buffer{sizeof(T), stream, mr} { @@ -76,7 +76,7 @@ class device_scalar { */ explicit device_scalar( T const &initial_value, - cuda_stream_view const &stream = cuda_stream_view{}, + cuda_stream_view stream = cuda_stream_view{}, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) : buffer{sizeof(T), stream, mr} { @@ -96,7 +96,7 @@ class device_scalar { * @param mr The resource to use for allocating the new `device_scalar` */ device_scalar(device_scalar const &other, - cuda_stream_view const &stream = {}, + cuda_stream_view stream = {}, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) : buffer{other.buffer, stream, mr} { @@ -118,7 +118,7 @@ class device_scalar { * @return T The value of the scalar. * @param stream CUDA stream on which to perform the copy and synchronize. */ - T value(cuda_stream_view const &stream = cuda_stream_view{}) const + T value(cuda_stream_view stream = cuda_stream_view{}) const { T host_value{}; _memcpy(&host_value, buffer.data(), stream); @@ -129,6 +129,9 @@ class device_scalar { /** * @brief Sets the value of the `device_scalar` to the given `host_value`. * + * This specialization for fundamental types is optimized to use `cudaMemsetAsync` when + * `host_value` is zero. + * * @note If the stream specified to this function is different from the stream specified * to the constructor, then appropriate dependencies must be inserted between the streams * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling @@ -138,8 +141,9 @@ class device_scalar { * referenced by `host_value` should not be destroyed or modified until `stream` has been * synchronized. Otherwise, behavior is undefined. * - * @note: This function incurs a host to device memcpy and should be used sparingly. - + * @note: This function incurs a host to device memcpy or device memset and should be used + * sparingly. + * * Example: * \code{cpp} * rmm::device_scalar s; @@ -155,17 +159,16 @@ class device_scalar { * \endcode * * @throws `rmm::cuda_error` if copying `host_value` to device memory fails. - * @throws `rmm::cuda_error` if synchronizing `stream` fails. * * @param host_value The host value which will be copied to device * @param stream CUDA stream on which to perform the copy */ - template - auto set_value(T const &host_value, cuda_stream_view const &stream = cuda_stream_view{}) - -> std::enable_if_t::value, Placeholder> + template + auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{}) + -> std::enable_if_t::value && not std::is_same::value, void> { - if (host_value == T{0}) { - RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(T), stream.value())); + if (host_value == U{0}) { + set_value_zero(stream); } else { _memcpy(buffer.data(), &host_value, stream); } @@ -174,6 +177,49 @@ class device_scalar { /** * @brief Sets the value of the `device_scalar` to the given `host_value`. * + * This specialization for `bool` is optimized to always use `cudaMemsetAsync`. + * + * @note If the stream specified to this function is different from the stream specified + * to the constructor, then appropriate dependencies must be inserted between the streams + * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling + * this function, otherwise there may be a race condition. + * + * This function does not synchronize `stream` before returning. `host_value` is passed by value + * so a host-side copy may be performed before calling a device memset. + * + * @note: This function incurs a device memset. + * + * Example: + * \code{cpp} + * rmm::device_scalar s; + * + * bool v{true}; + * + * // Copies `true` to device storage on `stream`. Does _not_ synchronize + * vec.set_value(v, stream); + * ... + * cudaStreamSynchronize(stream); + * // Synchronization is required before `v` can be modified + * v = false; + * \endcode + * + * @throws `rmm::cuda_error` if the device memset fails. + * + * @param host_value The host value which the scalar will be set to (true or false) + * @param stream CUDA stream on which to perform the device memset + */ + template + auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{}) + -> std::enable_if_t::value, void> + { + RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), host_value, sizeof(bool), stream.value())); + } + + /** + * @brief Sets the value of the `device_scalar` to the given `host_value`. + * + * Specialization for non-fundamental types. + * * @note If the stream specified to this function is different from the stream specified * to the constructor, then appropriate dependencies must be inserted between the streams * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling @@ -187,16 +233,16 @@ class device_scalar { * Example: * \code{cpp} - * rmm::device_scalar s; + * rmm::device_scalar s; * - * int v{42}; + * my_type v{42, "text"}; * * // Copies 42 to device storage on `stream`. Does _not_ synchronize * vec.set_value(v, stream); * ... * cudaStreamSynchronize(stream); * // Synchronization is required before `v` can be modified - * v = 13; + * v.value = 21; * \endcode * * @throws `rmm::cuda_error` if copying `host_value` to device memory fails @@ -205,13 +251,42 @@ class device_scalar { * @param host_value The host value which will be copied to device * @param stream CUDA stream on which to perform the copy */ - template - auto set_value(T const &host_value, cuda_stream_view const &stream = cuda_stream_view{}) - -> std::enable_if_t::value, Placeholder> + template + auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{}) + -> std::enable_if_t::value, void> { _memcpy(buffer.data(), &host_value, stream); } + // Disallow passing literals to set_value to avoid race conditions where the memory holding the + // literal can be freed before the async memcpy / memset executes. + void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete; + + /** + * @brief Sets the value of the `device_scalar` to zero. + * + * Only supported for fundamental types. + * + * @note If the stream specified to this function is different from the stream specified + * to the constructor, then appropriate dependencies must be inserted between the streams + * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling + * this function, otherwise there may be a race condition. + * + * This function does not synchronize `stream` before returning. + * + * @note: This function incurs a device memset and should be used sparingly. + * + * @throws `rmm::cuda_error` if the device memset fails. + * + * @param stream CUDA stream on which to perform the device memset + */ + template + auto set_value_zero(cuda_stream_view stream = cuda_stream_view{}) + -> std::enable_if_t::value, void> + { + RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(U), stream.value())); + } + /** * @brief Returns pointer to object in device memory. * @@ -241,7 +316,7 @@ class device_scalar { private: rmm::device_buffer buffer{sizeof(T)}; - inline void _memcpy(void *dst, const void *src, cuda_stream_view const &stream) const + inline void _memcpy(void *dst, const void *src, cuda_stream_view stream) const { RMM_CUDA_TRY(cudaMemcpyAsync(dst, src, sizeof(T), cudaMemcpyDefault, stream.value())); } diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 1b22cb587..295b4a5fa 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -167,8 +167,8 @@ class device_uvector { * Because this function synchronizes the stream `s`, it is safe to destroy or modify the object * referenced by `v` after this function has returned. * - * @note: This function incurs a host to device memcpy and should be used sparingly. - * @note: This function synchronizes `stream`. + * @note This function incurs a host to device memcpy and should be used sparingly. + * @note This function synchronizes `stream`. * * Example: * \code{cpp} @@ -206,7 +206,11 @@ class device_uvector { * referenced by `v` should not be destroyed or modified until `stream` has been synchronized. * Otherwise, behavior is undefined. * - * @note: This function incurs a host to device memcpy and should be used sparingly. + * @note This function incurs a host to device memcpy and should be used sparingly. + * + * @note Calling this function with a literal or other r-value reference for `v` is disallowed + * to prevent the implementation from asynchronously copying from a literal or other implicit + * temporary after it is deleted or goes out of scope. * * Example: * \code{cpp} @@ -236,11 +240,17 @@ class device_uvector { cudaMemcpyAsync(element_ptr(element_index), &v, sizeof(v), cudaMemcpyDefault, s.value())); } + // We delete the r-value reference overload to prevent asynchronously copying from a literal or + // implicit temporary value after it is deleted or goes out of scope. + void set_element_async(std::size_t element_index, + value_type const&& v, + cuda_stream_view s) = delete; + /** * @brief Returns the specified element from device memory * - * @note: This function incurs a device to host memcpy and should be used sparingly. - * @note: This function synchronizes `stream`. + * @note This function incurs a device to host memcpy and should be used sparingly. + * @note This function synchronizes `stream`. * * @throws rmm::out_of_range exception if `element_index >= size()` * @@ -262,7 +272,8 @@ class device_uvector { /** * @brief Returns the first element. * - * @note: This function incurs a device to host memcpy and should be used sparingly. + * @note This function incurs a device-to-host memcpy and should be used sparingly. + * @note This function synchronizes `stream`. * * @throws rmm::out_of_range exception if the vector is empty. * @@ -274,7 +285,8 @@ class device_uvector { /** * @brief Returns the last element. * - * @note: This function incurs a device to host memcpy and should be used sparingly. + * @note This function incurs a device-to-host memcpy and should be used sparingly. + * @note This function synchronizes `stream`. * * @throws rmm::out_of_range exception if the vector is empty. * diff --git a/tests/device_scalar_tests.cpp b/tests/device_scalar_tests.cpp index b9a6def97..8eaa6bbc6 100644 --- a/tests/device_scalar_tests.cpp +++ b/tests/device_scalar_tests.cpp @@ -25,20 +25,43 @@ #include #include #include +#include template struct DeviceScalarTest : public ::testing::Test { + T value{}; rmm::cuda_stream stream{}; rmm::mr::device_memory_resource* mr{rmm::mr::get_current_device_resource()}; - T value{}; std::default_random_engine generator{}; - std::uniform_int_distribution distribution{std::numeric_limits::lowest(), - std::numeric_limits::max()}; - DeviceScalarTest() { value = distribution(generator); } + DeviceScalarTest() { value = random_value(); } + + template ::value, bool> = true> + U random_value() + { + static std::bernoulli_distribution distribution{}; + return distribution(generator); + } + + template < + typename U = T, + std::enable_if_t<(std::is_integral::value && not std::is_same::value), bool> = true> + U random_value() + { + static std::uniform_int_distribution distribution{std::numeric_limits::lowest(), + std::numeric_limits::max()}; + return distribution(generator); + } + + template ::value, bool> = true> + U random_value() + { + static std::normal_distribution distribution{100, 20}; + return distribution(generator); + } }; -using Types = ::testing::Types; +using Types = ::testing::Types; TYPED_TEST_CASE(DeviceScalarTest, Types); @@ -88,7 +111,7 @@ TYPED_TEST(DeviceScalarTest, SetValue) rmm::device_scalar scalar{this->value, this->stream, this->mr}; EXPECT_NE(nullptr, scalar.data()); - auto expected = this->distribution(this->generator); + auto expected = this->random_value(); scalar.set_value(expected, this->stream); EXPECT_EQ(expected, scalar.value(this->stream)); diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 9fcffbd43..f3563b50f 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -174,8 +174,9 @@ TYPED_TEST(TypedUVectorTest, GetSetElementAsync) auto size = 12345; rmm::device_uvector uv(size, this->stream()); for (std::size_t i = 0; i < uv.size(); ++i) { - uv.set_element_async(i, i, this->stream()); - EXPECT_EQ(static_cast(i), uv.element(i, this->stream())); + auto init = static_cast(i); + uv.set_element_async(i, init, this->stream()); + EXPECT_EQ(init, uv.element(i, this->stream())); } }