From 0fadc71411ce9249d173371d64efcef10271d1bb Mon Sep 17 00:00:00 2001
From: Mark Harris <mharris@nvidia.com>
Date: Wed, 10 Mar 2021 15:05:11 +1100
Subject: [PATCH 1/5] Make it invalid to pass a literal value to
 `rmm::device_uvector::set_value_async`

---
 include/rmm/device_uvector.hpp | 26 +++++++++++++++++++-------
 tests/device_uvector_tests.cpp |  5 +++--
 2 files changed, 22 insertions(+), 9 deletions(-)
diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp
index 1b22cb587..295b4a5fa 100644
--- a/include/rmm/device_uvector.hpp
+++ b/include/rmm/device_uvector.hpp
@@ -167,8 +167,8 @@ class device_uvector {
    * Because this function synchronizes the stream `s`, it is safe to destroy or modify the object
    * referenced by `v` after this function has returned.
    *
-   * @note: This function incurs a host to device memcpy and should be used sparingly.
-   * @note: This function synchronizes `stream`.
+   * @note This function incurs a host to device memcpy and should be used sparingly.
+   * @note This function synchronizes `stream`.
    *
    * Example:
    * \code{cpp}
@@ -206,7 +206,11 @@ class device_uvector {
    * referenced by `v` should not be destroyed or modified until `stream` has been synchronized.
    * Otherwise, behavior is undefined.
    *
-   * @note: This function incurs a host to device memcpy and should be used sparingly.
+   * @note This function incurs a host to device memcpy and should be used sparingly.
+   *
+   * @note Calling this function with a literal or other r-value reference for `v` is disallowed
+   *  to prevent the implementation from asynchronously copying from a literal or other implicit
+   *  temporary after it is deleted or goes out of scope.
    *
    * Example:
    * \code{cpp}
@@ -236,11 +240,17 @@ class device_uvector {
       cudaMemcpyAsync(element_ptr(element_index), &v, sizeof(v), cudaMemcpyDefault, s.value()));
   }
 
+  // We delete the r-value reference overload to prevent asynchronously copying from a literal or
+  // implicit temporary value after it is deleted or goes out of scope.
+  void set_element_async(std::size_t element_index,
+                         value_type const&& v,
+                         cuda_stream_view s) = delete;
+
   /**
    * @brief Returns the specified element from device memory
    *
-   * @note: This function incurs a device to host memcpy and should be used sparingly.
-   * @note: This function synchronizes `stream`.
+   * @note This function incurs a device to host memcpy and should be used sparingly.
+   * @note This function synchronizes `stream`.
    *
    * @throws rmm::out_of_range exception if `element_index >= size()`
    *
@@ -262,7 +272,8 @@ class device_uvector {
   /**
    * @brief Returns the first element.
    *
-   * @note: This function incurs a device to host memcpy and should be used sparingly.
+   * @note This function incurs a device-to-host memcpy and should be used sparingly.
+   * @note This function synchronizes `stream`.
    *
    * @throws rmm::out_of_range exception if the vector is empty.
    *
@@ -274,7 +285,8 @@ class device_uvector {
   /**
    * @brief Returns the last element.
    *
-   * @note: This function incurs a device to host memcpy and should be used sparingly.
+   * @note This function incurs a device-to-host memcpy and should be used sparingly.
+   * @note This function synchronizes `stream`.
    *
    * @throws rmm::out_of_range exception if the vector is empty.
    *
diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp
index 9fcffbd43..f3563b50f 100644
--- a/tests/device_uvector_tests.cpp
+++ b/tests/device_uvector_tests.cpp
@@ -174,8 +174,9 @@ TYPED_TEST(TypedUVectorTest, GetSetElementAsync)
   auto size = 12345;
   rmm::device_uvector<TypeParam> uv(size, this->stream());
   for (std::size_t i = 0; i < uv.size(); ++i) {
-    uv.set_element_async(i, i, this->stream());
-    EXPECT_EQ(static_cast<TypeParam>(i), uv.element(i, this->stream()));
+    auto init = static_cast<TypeParam>(i);
+    uv.set_element_async(i, init, this->stream());
+    EXPECT_EQ(init, uv.element(i, this->stream()));
   }
 }
 

From a37ba8f87edd0423ec298dfae39170f60b305067 Mon Sep 17 00:00:00 2001
From: Mark Harris <mharris@nvidia.com>
Date: Wed, 10 Mar 2021 15:05:29 +1100
Subject: [PATCH 2/5] Make it invalid to pass a literal value to
 `rmm::device_scalar::set_value` and add `set_value_zero`

---
 include/rmm/device_scalar.hpp | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index 5615121c7..fca575bce 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -52,7 +52,7 @@ class device_scalar {
    * @param mr Optional, resource with which to allocate.
    */
   explicit device_scalar(
-    cuda_stream_view const &stream,
+    cuda_stream_view stream,
     rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
     : buffer{sizeof(T), stream, mr}
   {
@@ -76,7 +76,7 @@ class device_scalar {
    */
   explicit device_scalar(
     T const &initial_value,
-    cuda_stream_view const &stream      = cuda_stream_view{},
+    cuda_stream_view stream             = cuda_stream_view{},
     rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
     : buffer{sizeof(T), stream, mr}
   {
@@ -96,7 +96,7 @@ class device_scalar {
    * @param mr The resource to use for allocating the new `device_scalar`
    */
   device_scalar(device_scalar const &other,
-                cuda_stream_view const &stream      = {},
+                cuda_stream_view stream             = {},
                 rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
     : buffer{other.buffer, stream, mr}
   {
@@ -118,7 +118,7 @@ class device_scalar {
    * @return T The value of the scalar.
    * @param stream CUDA stream on which to perform the copy and synchronize.
    */
-  T value(cuda_stream_view const &stream = cuda_stream_view{}) const
+  T value(cuda_stream_view stream = cuda_stream_view{}) const
   {
     T host_value{};
     _memcpy(&host_value, buffer.data(), stream);
@@ -161,7 +161,7 @@ class device_scalar {
    * @param stream CUDA stream on which to perform the copy
    */
   template <typename Placeholder = void>
-  auto set_value(T const &host_value, cuda_stream_view const &stream = cuda_stream_view{})
+  auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{})
     -> std::enable_if_t<std::is_fundamental<T>::value, Placeholder>
   {
     if (host_value == T{0}) {
@@ -171,6 +171,15 @@ class device_scalar {
     }
   }
 
+  void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete;
+
+  template <typename Placeholder = void>
+  auto set_value_zero(cuda_stream_view stream = cuda_stream_view{})
+    -> std::enable_if_t<std::is_fundamental<T>::value, Placeholder>
+  {
+    RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(T), stream.value()));
+  }
+
   /**
    * @brief Sets the value of the `device_scalar` to the given `host_value`.
    *
@@ -206,7 +215,7 @@ class device_scalar {
    * @param stream CUDA stream on which to perform the copy
    */
   template <typename Placeholder = void>
-  auto set_value(T const &host_value, cuda_stream_view const &stream = cuda_stream_view{})
+  auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{})
     -> std::enable_if_t<not std::is_fundamental<T>::value, Placeholder>
   {
     _memcpy(buffer.data(), &host_value, stream);
@@ -241,7 +250,7 @@ class device_scalar {
  private:
   rmm::device_buffer buffer{sizeof(T)};
 
-  inline void _memcpy(void *dst, const void *src, cuda_stream_view const &stream) const
+  inline void _memcpy(void *dst, const void *src, cuda_stream_view stream) const
   {
     RMM_CUDA_TRY(cudaMemcpyAsync(dst, src, sizeof(T), cudaMemcpyDefault, stream.value()));
   }

From 378a1140e0c46e331d779fe56a621a532adddaab Mon Sep 17 00:00:00 2001
From: Mark Harris <mharris@nvidia.com>
Date: Thu, 11 Mar 2021 10:07:20 +1100
Subject: [PATCH 3/5] Add optimized set_value for bools, and `set_value_zero`

---
 include/rmm/device_scalar.hpp | 107 ++++++++++++++++++++++++++++------
 1 file changed, 88 insertions(+), 19 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index fca575bce..b19cc859b 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -129,6 +129,9 @@ class device_scalar {
   /**
    * @brief Sets the value of the `device_scalar` to the given `host_value`.
    *
+   * This specialization for fundamental types is optimized to use `cudaMemsetAsync` when
+   * `host_value` is zero.
+   *
    * @note If the stream specified to this function is different from the stream specified
    * to the constructor, then appropriate dependencies must be inserted between the streams
    * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
@@ -138,8 +141,9 @@ class device_scalar {
    * referenced by `host_value` should not be destroyed or modified until `stream` has been
    * synchronized. Otherwise, behavior is undefined.
    *
-   * @note: This function incurs a host to device memcpy and should be used sparingly.
-
+   * @note: This function incurs a host to device memcpy or device memset and should be used
+   * sparingly.
+   *
    * Example:
    * \code{cpp}
    * rmm::device_scalar<int32_t> s;
@@ -155,34 +159,70 @@ class device_scalar {
    * \endcode
    *
    * @throws `rmm::cuda_error` if copying `host_value` to device memory fails.
-   * @throws `rmm::cuda_error` if synchronizing `stream` fails.
    *
    * @param host_value The host value which will be copied to device
    * @param stream CUDA stream on which to perform the copy
    */
-  template <typename Placeholder = void>
-  auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{})
-    -> std::enable_if_t<std::is_fundamental<T>::value, Placeholder>
+  template <typename U = T>
+  auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
+    -> std::enable_if_t<std::is_fundamental<U>::value && not std::is_same<U, bool>::value, void>
   {
-    if (host_value == T{0}) {
-      RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(T), stream.value()));
+    if (host_value == U{0}) {
+      set_value_zero(stream);
     } else {
       _memcpy(buffer.data(), &host_value, stream);
     }
   }
 
-  void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete;
-
-  template <typename Placeholder = void>
-  auto set_value_zero(cuda_stream_view stream = cuda_stream_view{})
-    -> std::enable_if_t<std::is_fundamental<T>::value, Placeholder>
+  /**
+   * @brief Sets the value of the `device_scalar` to the given `host_value`.
+   *
+   * This specialization for `bool` is optimized to always use `cudaMemsetAsync`.
+   *
+   * @note If the stream specified to this function is different from the stream specified
+   * to the constructor, then appropriate dependencies must be inserted between the streams
+   * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
+   * this function, otherwise there may be a race condition.
+   *
+   * This function does not synchronize `stream` before returning. Therefore, the object
+   * referenced by `host_value` should not be destroyed or modified until `stream` has been
+   * synchronized. Otherwise, behavior is undefined.
+   *
+   * @note: This function incurs a host to device memcpy or device memset and should be used
+   * sparingly.
+   *
+   * Example:
+   * \code{cpp}
+   * rmm::device_scalar<bool> s;
+   *
+   * bool v{true};
+   *
+   * // Copies 42 to device storage on `stream`. Does _not_ synchronize
+   * vec.set_value(v, stream);
+   * ...
+   * cudaStreamSynchronize(stream);
+   * // Synchronization is required before `v` can be modified
+   * v = false;
+   * \endcode
+   *
+   * @throws `rmm::cuda_error` if the device memset fails.
+   *
+   * @param host_value The host value which the scalar will be set to (true or false)
+   * @param stream CUDA stream on which to perform the device memset
+   */
+  template <typename U = T>
+  auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
+    -> std::enable_if_t<std::is_same<U, bool>::value, void>
   {
-    RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(T), stream.value()));
+    RMM_CUDA_TRY(cudaMemsetAsync(
+      buffer.data(), host_value == true ? true : false, sizeof(bool), stream.value()));
   }
 
   /**
    * @brief Sets the value of the `device_scalar` to the given `host_value`.
    *
+   * Specialization for non-fundamental types.
+   *
    * @note If the stream specified to this function is different from the stream specified
    * to the constructor, then appropriate dependencies must be inserted between the streams
    * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
@@ -196,16 +236,16 @@ class device_scalar {
 
    * Example:
    * \code{cpp}
-   * rmm::device_scalar<int32_t> s;
+   * rmm::device_scalar<my_type> s;
    *
-   * int v{42};
+   * my_type v{42, "text"};
    *
    * // Copies 42 to device storage on `stream`. Does _not_ synchronize
    * vec.set_value(v, stream);
    * ...
    * cudaStreamSynchronize(stream);
    * // Synchronization is required before `v` can be modified
-   * v = 13;
+   * v.value = 21;
    * \endcode
    *
    * @throws `rmm::cuda_error` if copying `host_value` to device memory fails
@@ -214,13 +254,42 @@ class device_scalar {
    * @param host_value The host value which will be copied to device
    * @param stream CUDA stream on which to perform the copy
    */
-  template <typename Placeholder = void>
+  template <typename U = T>
   auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{})
-    -> std::enable_if_t<not std::is_fundamental<T>::value, Placeholder>
+    -> std::enable_if_t<not std::is_fundamental<U>::value, void>
   {
     _memcpy(buffer.data(), &host_value, stream);
   }
 
+  // Disallow passing literals to set_value to avoid race conditions where the memory holding the
+  // literal can be freed before the async memcpy / memset executes.
+  void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete;
+
+  /**
+   * @brief Sets the value of the `device_scalar` to zero.
+   *
+   * Only supported for fundamental types.
+   *
+   * @note If the stream specified to this function is different from the stream specified
+   * to the constructor, then appropriate dependencies must be inserted between the streams
+   * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
+   * this function, otherwise there may be a race condition.
+   *
+   * This function does not synchronize `stream` before returning.
+   *
+   * @note: This function incurs a device memset and should be used sparingly.
+   *
+   * @throws `rmm::cuda_error` if the device memset fails.
+   *
+   * @param stream CUDA stream on which to perform the device memset
+   */
+  template <typename U = T>
+  auto set_value_zero(cuda_stream_view stream = cuda_stream_view{})
+    -> std::enable_if_t<std::is_fundamental<U>::value, void>
+  {
+    RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(U), stream.value()));
+  }
+
   /**
    * @brief Returns pointer to object in device memory.
    *

From fded78ff6a013a2eac8b18aa80dc587575cb5ef0 Mon Sep 17 00:00:00 2001
From: Mark Harris <mharris@nvidia.com>
Date: Tue, 16 Mar 2021 11:59:33 +1100
Subject: [PATCH 4/5] Simplify boolean logic and fix doc error.

---
 include/rmm/device_scalar.hpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp
index b19cc859b..c54fc7937 100644
--- a/include/rmm/device_scalar.hpp
+++ b/include/rmm/device_scalar.hpp
@@ -184,12 +184,10 @@ class device_scalar {
    * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
    * this function, otherwise there may be a race condition.
    *
-   * This function does not synchronize `stream` before returning. Therefore, the object
-   * referenced by `host_value` should not be destroyed or modified until `stream` has been
-   * synchronized. Otherwise, behavior is undefined.
+   * This function does not synchronize `stream` before returning. `host_value` is passed by value
+   * so a host-side copy may be performed before calling a device memset.
    *
-   * @note: This function incurs a host to device memcpy or device memset and should be used
-   * sparingly.
+   * @note: This function incurs a device memset.
    *
    * Example:
    * \code{cpp}
@@ -197,7 +195,7 @@ class device_scalar {
    *
    * bool v{true};
    *
-   * // Copies 42 to device storage on `stream`. Does _not_ synchronize
+   * // Copies `true` to device storage on `stream`. Does _not_ synchronize
    * vec.set_value(v, stream);
    * ...
    * cudaStreamSynchronize(stream);
@@ -214,8 +212,7 @@ class device_scalar {
   auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
     -> std::enable_if_t<std::is_same<U, bool>::value, void>
   {
-    RMM_CUDA_TRY(cudaMemsetAsync(
-      buffer.data(), host_value == true ? true : false, sizeof(bool), stream.value()));
+    RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), host_value, sizeof(bool), stream.value()));
   }
 
   /**

From dd21418e033e913a41654dee02ff5a385d5f2c5d Mon Sep 17 00:00:00 2001
From: Mark Harris <mharris@nvidia.com>
Date: Tue, 16 Mar 2021 11:59:47 +1100
Subject: [PATCH 5/5] Expand device_scalar tests to bool and floating point
 types

---
 tests/device_scalar_tests.cpp | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/tests/device_scalar_tests.cpp b/tests/device_scalar_tests.cpp
index b9a6def97..8eaa6bbc6 100644
--- a/tests/device_scalar_tests.cpp
+++ b/tests/device_scalar_tests.cpp
@@ -25,20 +25,43 @@
 #include <chrono>
 #include <cstddef>
 #include <random>
+#include <type_traits>
 
 template <typename T>
 struct DeviceScalarTest : public ::testing::Test {
+  T value{};
   rmm::cuda_stream stream{};
   rmm::mr::device_memory_resource* mr{rmm::mr::get_current_device_resource()};
-  T value{};
   std::default_random_engine generator{};
-  std::uniform_int_distribution<T> distribution{std::numeric_limits<T>::lowest(),
-                                                std::numeric_limits<T>::max()};
 
-  DeviceScalarTest() { value = distribution(generator); }
+  DeviceScalarTest() { value = random_value(); }
+
+  template <typename U = T, std::enable_if_t<std::is_same<U, bool>::value, bool> = true>
+  U random_value()
+  {
+    static std::bernoulli_distribution distribution{};
+    return distribution(generator);
+  }
+
+  template <
+    typename U                                                                               = T,
+    std::enable_if_t<(std::is_integral<U>::value && not std::is_same<U, bool>::value), bool> = true>
+  U random_value()
+  {
+    static std::uniform_int_distribution<U> distribution{std::numeric_limits<T>::lowest(),
+                                                         std::numeric_limits<T>::max()};
+    return distribution(generator);
+  }
+
+  template <typename U = T, std::enable_if_t<std::is_floating_point<U>::value, bool> = true>
+  U random_value()
+  {
+    static std::normal_distribution<U> distribution{100, 20};
+    return distribution(generator);
+  }
 };
 
-using Types = ::testing::Types<int8_t, int16_t, int32_t, int64_t>;
+using Types = ::testing::Types<bool, int8_t, int16_t, int32_t, int64_t, float, double>;
 
 TYPED_TEST_CASE(DeviceScalarTest, Types);
 
@@ -88,7 +111,7 @@ TYPED_TEST(DeviceScalarTest, SetValue)
   rmm::device_scalar<TypeParam> scalar{this->value, this->stream, this->mr};
   EXPECT_NE(nullptr, scalar.data());
 
-  auto expected = this->distribution(this->generator);
+  auto expected = this->random_value();
 
   scalar.set_value(expected, this->stream);
   EXPECT_EQ(expected, scalar.value(this->stream));