From 15f2e92fdf3f20a3eeaf407d8bf7c572cd6f5807 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 4 Feb 2022 13:26:00 -0500
Subject: [PATCH 01/18] Add owning and non-owning semantics to
 nvbench::cuda_stream

---
 nvbench/cuda_stream.cuh | 47 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index 0efb43b..2ed1e5e 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -28,19 +28,58 @@ namespace nvbench
 // RAII wrapper for a cudaStream_t.
 struct cuda_stream
 {
-  cuda_stream() { NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream)); }
-  ~cuda_stream() { NVBENCH_CUDA_CALL(cudaStreamDestroy(m_stream)); }
+  cuda_stream()
+      : m_owning(true)
+  {
+    NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream));
+  }
+
+  cuda_stream(cuda_stream stream, bool owning)
+      : m_stream(stream)
+      , m_owning(owning)
+  {}
+
+  // destroy the stream if it's owning
+  void destroy()
+  {
+    if (m_owning)
+    {
+      NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(m_stream));
+    }
+  }
+
+  ~cuda_stream() { destroy(); }
 
   // move-only
   cuda_stream(const cuda_stream &) = delete;
-  cuda_stream(cuda_stream &&)      = default;
   cuda_stream &operator=(const cuda_stream &) = delete;
-  cuda_stream &operator=(cuda_stream &&) = default;
+
+  cuda_stream(cuda_stream &&other)
+      : m_stream(other.get_stream())
+      , m_owning(other.is_owning())
+  {
+    other.destroy();
+  }
+
+  cuda_stream &operator=(cuda_stream &&other)
+  {
+    m_stream = other.get_stream();
+    m_owning = other.is_owning();
+
+    other.destroy();
+
+    return *this;
+  }
 
   operator cudaStream_t() const { return m_stream; }
 
+  cudaStream_t get_stream() const { return m_stream; }
+
+  bool is_owning() const { return m_owning; }
+
 private:
   cudaStream_t m_stream;
+  bool m_owning;
 };
 
 } // namespace nvbench

From 8aea3e467ec0c73dd2c52ac0a961bc01b2094674 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 4 Feb 2022 13:51:30 -0500
Subject: [PATCH 02/18] Add a cuda stream member to nvbench::state

---
 nvbench/cuda_stream.cuh | 10 +++++-----
 nvbench/state.cuh       | 19 ++++++++++++++-----
 nvbench/state.cxx       |  6 ++++--
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index 2ed1e5e..3cd2fb4 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -29,14 +29,14 @@ namespace nvbench
 struct cuda_stream
 {
   cuda_stream()
-      : m_owning(true)
+      : m_owning{true}
   {
     NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream));
   }
 
   cuda_stream(cuda_stream stream, bool owning)
-      : m_stream(stream)
-      , m_owning(owning)
+      : m_stream{stream}
+      , m_owning{owning}
   {}
 
   // destroy the stream if it's owning
@@ -55,8 +55,8 @@ struct cuda_stream
   cuda_stream &operator=(const cuda_stream &) = delete;
 
   cuda_stream(cuda_stream &&other)
-      : m_stream(other.get_stream())
-      , m_owning(other.is_owning())
+      : m_stream{other.get_stream()}
+      , m_owning{other.is_owning()}
   {
     other.destroy();
   }
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 0b0b4cc..b244c0d 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -18,6 +18,7 @@
 
 #pragma once
 
+#include <nvbench/cuda_stream.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/named_values.cuh>
@@ -62,6 +63,15 @@ struct state
   state &operator=(const state &) = delete;
   state &operator=(state &&) = default;
 
+  [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const
+  {
+    return m_cuda_stream;
+  }
+  void set_cuda_stream(nvbench::cuda_stream &&stream)
+  {
+    m_cuda_stream = std::move(stream);
+  }
+
   /// The CUDA device associated with with this benchmark state. May be
   /// nullopt for CPU-only benchmarks.
   [[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
@@ -259,11 +269,9 @@ struct state
 
   [[nodiscard]] bool is_cupti_required() const
   {
-    return is_l2_hit_rate_collected()
-        || is_l1_hit_rate_collected()
-        || is_stores_efficiency_collected()
-        || is_loads_efficiency_collected()
-        || is_dram_throughput_collected();
+    return is_l2_hit_rate_collected() || is_l1_hit_rate_collected() ||
+           is_stores_efficiency_collected() ||
+           is_loads_efficiency_collected() || is_dram_throughput_collected();
   }
 
   summary &add_summary(std::string summary_tag);
@@ -303,6 +311,7 @@ private:
         std::optional<nvbench::device_info> device,
         std::size_t type_config_index);
 
+  nvbench::cuda_stream m_cuda_stream;
   std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
   nvbench::named_values m_axis_values;
   std::optional<nvbench::device_info> m_device;
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index c7139ce..bdc4d8b 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -33,7 +33,8 @@ namespace nvbench
 {
 
 state::state(const benchmark_base &bench)
-    : m_benchmark{bench}
+    : m_cuda_stream{}
+    , m_benchmark{bench}
     , m_run_once{bench.get_run_once()}
     , m_min_samples{bench.get_min_samples()}
     , m_min_time{bench.get_min_time()}
@@ -46,7 +47,8 @@ state::state(const benchmark_base &bench,
              nvbench::named_values values,
              std::optional<nvbench::device_info> device,
              std::size_t type_config_index)
-    : m_benchmark{bench}
+    : m_cuda_stream{}
+    , m_benchmark{bench}
     , m_axis_values{std::move(values)}
     , m_device{std::move(device)}
     , m_type_config_index{type_config_index}

From c510a0e78c240be31adc87474e3ec59f56571602 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 4 Feb 2022 13:56:02 -0500
Subject: [PATCH 03/18] Update launch to hold a const ref of
 nvbenc::cuda_stream

---
 nvbench/launch.cuh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/nvbench/launch.cuh b/nvbench/launch.cuh
index fa1bf95..5479462 100644
--- a/nvbench/launch.cuh
+++ b/nvbench/launch.cuh
@@ -25,8 +25,11 @@ namespace nvbench
 
 struct launch
 {
+  explicit launch(const nvbench::cuda_stream &stream)
+      : m_stream{stream}
+  {}
+
   // move-only
-  launch()               = default;
   launch(const launch &) = delete;
   launch(launch &&)      = default;
   launch &operator=(const launch &) = delete;
@@ -38,7 +41,7 @@ struct launch
   };
 
 private:
-  nvbench::cuda_stream m_stream;
+  const nvbench::cuda_stream &m_stream;
 };
 
 } // namespace nvbench

From 14eab0774a53b82996305f5963a8da54320174d9 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 4 Feb 2022 14:16:43 -0500
Subject: [PATCH 04/18] Update measure_* classes to construct launch from the
 state cuda stream

---
 nvbench/detail/measure_cold.cu  | 1 +
 nvbench/detail/measure_cupti.cu | 7 ++++++-
 nvbench/detail/measure_hot.cu   | 1 +
 nvbench/state.cxx               | 6 ++----
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index f8f5309..7014064 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -39,6 +39,7 @@ namespace nvbench::detail
 
 measure_cold_base::measure_cold_base(state &exec_state)
     : m_state{exec_state}
+    , m_launch{m_state.get_cuda_stream()}
     , m_run_once{exec_state.get_run_once()}
     , m_min_samples{exec_state.get_min_samples()}
     , m_max_noise{exec_state.get_max_noise()}
diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu
index b23a486..1a09f90 100644
--- a/nvbench/detail/measure_cupti.cu
+++ b/nvbench/detail/measure_cupti.cu
@@ -169,7 +169,12 @@ std::vector<std::string> add_metrics(nvbench::state &state)
 } // namespace
 
 measure_cupti_base::measure_cupti_base(state &exec_state)
-try : m_state{exec_state}, m_cupti(*m_state.get_device(), add_metrics(m_state))
+try : m_state
+{
+  exec_state
+}
+, m_launch{m_state.get_cuda_stream()},
+  m_cupti{*m_state.get_device(), add_metrics(m_state)}
 {}
 catch (const std::exception &ex)
 {
diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu
index d036d2e..0d4100e 100644
--- a/nvbench/detail/measure_hot.cu
+++ b/nvbench/detail/measure_hot.cu
@@ -37,6 +37,7 @@ namespace nvbench::detail
 
 measure_hot_base::measure_hot_base(state &exec_state)
     : m_state{exec_state}
+    , m_launch{m_state.get_cuda_stream()}
     , m_min_samples{exec_state.get_min_samples()}
     , m_min_time{exec_state.get_min_time()}
     , m_skip_time{exec_state.get_skip_time()}
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index bdc4d8b..c7139ce 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -33,8 +33,7 @@ namespace nvbench
 {
 
 state::state(const benchmark_base &bench)
-    : m_cuda_stream{}
-    , m_benchmark{bench}
+    : m_benchmark{bench}
     , m_run_once{bench.get_run_once()}
     , m_min_samples{bench.get_min_samples()}
     , m_min_time{bench.get_min_time()}
@@ -47,8 +46,7 @@ state::state(const benchmark_base &bench,
              nvbench::named_values values,
              std::optional<nvbench::device_info> device,
              std::size_t type_config_index)
-    : m_cuda_stream{}
-    , m_benchmark{bench}
+    : m_benchmark{bench}
     , m_axis_values{std::move(values)}
     , m_device{std::move(device)}
     , m_type_config_index{type_config_index}

From 86708ec793fa582669f60fad4cd17d86b10533b0 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 4 Feb 2022 16:03:52 -0500
Subject: [PATCH 05/18] Fix a stream destroy bug

---
 nvbench/cuda_stream.cuh | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index 3cd2fb4..d4c4b5e 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -34,7 +34,7 @@ struct cuda_stream
     NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream));
   }
 
-  cuda_stream(cuda_stream stream, bool owning)
+  cuda_stream(cudaStream_t stream, bool owning)
       : m_stream{stream}
       , m_owning{owning}
   {}
@@ -42,7 +42,7 @@ struct cuda_stream
   // destroy the stream if it's owning
   void destroy()
   {
-    if (m_owning)
+    if (m_owning and m_stream != cudaStreamDefault)
     {
       NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(m_stream));
     }
@@ -56,16 +56,24 @@ struct cuda_stream
 
   cuda_stream(cuda_stream &&other)
       : m_stream{other.get_stream()}
-      , m_owning{other.is_owning()}
+      , m_owning{other.get_owning()}
   {
+    if (m_owning)
+    {
+      other.set_owning(not m_owning);
+    }
     other.destroy();
   }
 
   cuda_stream &operator=(cuda_stream &&other)
   {
     m_stream = other.get_stream();
-    m_owning = other.is_owning();
+    m_owning = other.get_owning();
 
+    if (m_owning)
+    {
+      other.set_owning(not m_owning);
+    }
     other.destroy();
 
     return *this;
@@ -75,7 +83,8 @@ struct cuda_stream
 
   cudaStream_t get_stream() const { return m_stream; }
 
-  bool is_owning() const { return m_owning; }
+  [[nodiscard]] bool get_owning() const { return m_owning; }
+  void set_owning(bool b) { m_owning = b; }
 
 private:
   cudaStream_t m_stream;

From 439ffec1c8287067f0309b7b455f08182f83077a Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 4 Feb 2022 16:35:55 -0500
Subject: [PATCH 06/18] Minor correction

---
 nvbench/cuda_stream.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index d4c4b5e..0be0362 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -42,7 +42,7 @@ struct cuda_stream
   // destroy the stream if it's owning
   void destroy()
   {
-    if (m_owning and m_stream != cudaStreamDefault)
+    if (m_owning)
     {
       NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(m_stream));
     }

From 470beda9f047c5cad55f0a1cc96ec7c1127cc07b Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 4 Feb 2022 16:55:29 -0500
Subject: [PATCH 07/18] Add nvbench::state stream tests

---
 testing/state.cu | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/testing/state.cu b/testing/state.cu
index 58e8dd1..ade63ed 100644
--- a/testing/state.cu
+++ b/testing/state.cu
@@ -51,6 +51,23 @@ struct state_tester : public nvbench::state
 
 using nvbench::detail::state_tester;
 
+void test_streams()
+{
+  dummy_bench bench;
+
+  state_tester state{bench};
+
+  // Test non-owning stream
+  state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false});
+  ASSERT(state.get_cuda_stream() == cudaStreamDefault);
+
+  // Test owning stream
+  auto stream = nvbench::cuda_stream{};
+  auto gold   = stream.get_stream();
+  state.set_cuda_stream(std::move(stream));
+  ASSERT(state.get_cuda_stream() == gold);
+}
+
 void test_params()
 {
   dummy_bench bench;
@@ -110,6 +127,7 @@ void test_defaults()
 
 int main()
 {
+  test_streams();
   test_params();
   test_summaries();
   test_defaults();

From 76cbbcc8f92ce0b8f8c01b5165d7ae456e262fe8 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <wangyunsong89@gmail.com>
Date: Fri, 4 Feb 2022 17:20:40 -0500
Subject: [PATCH 08/18] Update benchmarks.md

---
 docs/benchmarks.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 09820f9..5ec0c22 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -11,6 +11,18 @@ void my_benchmark(nvbench::state& state) {
 NVBENCH_BENCH(my_benchmark);
 ```
 
+The following example shows how to benchmark functions that do not expose stream parameters:
+```cpp
+void my_benchmark(nvbench::state& state) {
+  state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false});
+  state.exec([](nvbench::launch&) {
+    my_func(); // a host API invoking GPU kernels without taking an explicit stream
+    my_kernel<<<num_blocks, 256>>>(); // or a kernel launched with the default stream
+  });
+}
+NVBENCH_BENCH(my_benchmark);
+```
+
 There are three main components in the definition of a benchmark:
 
 - A `KernelGenerator` callable (`my_benchmark` above)

From 33a896f99eab7001efead4f23132978d56b43c69 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 4 Feb 2022 17:25:50 -0500
Subject: [PATCH 09/18] Update copyright year

---
 nvbench/cuda_stream.cuh         | 2 +-
 nvbench/detail/measure_cold.cu  | 2 +-
 nvbench/detail/measure_cupti.cu | 2 +-
 nvbench/detail/measure_hot.cu   | 2 +-
 nvbench/launch.cuh              | 2 +-
 nvbench/state.cuh               | 2 +-
 testing/state.cu                | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index 0be0362..dfa9bb0 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 7014064..ec3bbbe 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu
index 1a09f90..75318ae 100644
--- a/nvbench/detail/measure_cupti.cu
+++ b/nvbench/detail/measure_cupti.cu
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu
index 0d4100e..25e2119 100644
--- a/nvbench/detail/measure_hot.cu
+++ b/nvbench/detail/measure_hot.cu
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
diff --git a/nvbench/launch.cuh b/nvbench/launch.cuh
index 5479462..fae3f0e 100644
--- a/nvbench/launch.cuh
+++ b/nvbench/launch.cuh
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index b244c0d..2369bba 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
diff --git a/testing/state.cu b/testing/state.cu
index ade63ed..de87c81 100644
--- a/testing/state.cu
+++ b/testing/state.cu
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with

From a2a12c689c3b15cec0f3bd5b05a25805b4103a0a Mon Sep 17 00:00:00 2001
From: Yunsong Wang <wangyunsong89@gmail.com>
Date: Sun, 6 Feb 2022 19:31:20 -0500
Subject: [PATCH 10/18] Update docs/benchmarks.md

Co-authored-by: Jake Hemstad <jhemstad@nvidia.com>
---
 docs/benchmarks.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 5ec0c22..5a16aec 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -11,7 +11,13 @@ void my_benchmark(nvbench::state& state) {
 NVBENCH_BENCH(my_benchmark);
 ```
 
-The following example shows how to benchmark functions that do not expose stream parameters:
+NVBench records the elapsed time of work on a CUDA stream for each iteration of a benchmark. 
+By default, NVBench creates and provides an explicit stream via `launch::get_stream()` 
+to pass to every stream-ordered operation. 
+
+Sometimes it is inconvenient or impossible to specify an explicit CUDA stream to every 
+stream-ordered operation. A `cudaStream_t` may be provided via `state::set_cuda_stream`. 
+It is assumed that all work of interest executes on or synchronizes with this stream. 
 ```cpp
 void my_benchmark(nvbench::state& state) {
   state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false});

From e7c29c1c1b47fde4b03febd2d56db166348c2f77 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Sun, 6 Feb 2022 19:34:57 -0500
Subject: [PATCH 11/18] Update docs

---
 docs/benchmarks.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 5a16aec..351a922 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -20,7 +20,8 @@ stream-ordered operation. A `cudaStream_t` may be provided via `state::set_cuda_
 It is assumed that all work of interest executes on or synchronizes with this stream. 
 ```cpp
 void my_benchmark(nvbench::state& state) {
-  state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false});
+  cudaStream_t default_stream = 0;
+  state.set_cuda_stream(nvbench::cuda_stream{default_stream, false});
   state.exec([](nvbench::launch&) {
     my_func(); // a host API invoking GPU kernels without taking an explicit stream
     my_kernel<<<num_blocks, 256>>>(); // or a kernel launched with the default stream

From e05bf002f71356c14c6288381f5c44e891d20768 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Sun, 6 Feb 2022 20:14:41 -0500
Subject: [PATCH 12/18] Use unique_ptr + custom deleter to simplify destroy
 logic

---
 nvbench/cuda_stream.cuh | 72 ++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 45 deletions(-)

diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index dfa9bb0..1c71372 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -22,6 +22,8 @@
 
 #include <cuda_runtime_api.h>
 
+#include <memory>
+
 namespace nvbench
 {
 
@@ -29,66 +31,46 @@ namespace nvbench
 struct cuda_stream
 {
   cuda_stream()
-      : m_owning{true}
-  {
-    NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream));
-  }
+      : m_stream{[]() {
+                   cudaStream_t s;
+                   NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
+                   return s;
+                 }(),
+                 stream_deleter{true}}
+  {}
 
   cuda_stream(cudaStream_t stream, bool owning)
-      : m_stream{stream}
-      , m_owning{owning}
+      : m_stream{stream, stream_deleter{owning}}
   {}
 
-  // destroy the stream if it's owning
-  void destroy()
-  {
-    if (m_owning)
-    {
-      NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(m_stream));
-    }
-  }
-
-  ~cuda_stream() { destroy(); }
+  ~cuda_stream() = default;
 
   // move-only
   cuda_stream(const cuda_stream &) = delete;
   cuda_stream &operator=(const cuda_stream &) = delete;
+  cuda_stream(cuda_stream &&)                 = default;
+  cuda_stream &operator=(cuda_stream &&) = default;
 
-  cuda_stream(cuda_stream &&other)
-      : m_stream{other.get_stream()}
-      , m_owning{other.get_owning()}
-  {
-    if (m_owning)
-    {
-      other.set_owning(not m_owning);
-    }
-    other.destroy();
-  }
+  operator cudaStream_t() const { return m_stream.get(); }
+
+  cudaStream_t get_stream() const { return m_stream.get(); }
 
-  cuda_stream &operator=(cuda_stream &&other)
+private:
+  struct stream_deleter
   {
-    m_stream = other.get_stream();
-    m_owning = other.get_owning();
+    using pointer = cudaStream_t;
+    bool owning;
 
-    if (m_owning)
+    constexpr void operator()(pointer s) const noexcept
     {
-      other.set_owning(not m_owning);
+      if (owning)
+      {
+        NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(s));
+      }
     }
-    other.destroy();
-
-    return *this;
-  }
+  };
 
-  operator cudaStream_t() const { return m_stream; }
-
-  cudaStream_t get_stream() const { return m_stream; }
-
-  [[nodiscard]] bool get_owning() const { return m_owning; }
-  void set_owning(bool b) { m_owning = b; }
-
-private:
-  cudaStream_t m_stream;
-  bool m_owning;
+  std::unique_ptr<cudaStream_t, stream_deleter> m_stream;
 };
 
 } // namespace nvbench

From 6159d9c6cb5bff31105c77bb8241c6fc3450022b Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Sun, 6 Feb 2022 20:19:21 -0500
Subject: [PATCH 13/18] Minor correction in unit test

---
 testing/state.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/testing/state.cu b/testing/state.cu
index de87c81..a2300d1 100644
--- a/testing/state.cu
+++ b/testing/state.cu
@@ -58,8 +58,9 @@ void test_streams()
   state_tester state{bench};
 
   // Test non-owning stream
-  state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false});
-  ASSERT(state.get_cuda_stream() == cudaStreamDefault);
+  cudaStream_t default_stream = 0;
+  state.set_cuda_stream(nvbench::cuda_stream{default_stream, false});
+  ASSERT(state.get_cuda_stream() == default_stream);
 
   // Test owning stream
   auto stream = nvbench::cuda_stream{};

From fde2e408de13e014103a4f836725be8b8ecc6e57 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 7 Feb 2022 13:09:35 -0500
Subject: [PATCH 14/18] Add stream benchmark example

---
 README.md               |  1 +
 examples/CMakeLists.txt |  1 +
 examples/stream.cu      | 62 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 examples/stream.cu

diff --git a/README.md b/README.md
index a7240ec..28c5568 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,7 @@ various NVBench features and usecases:
 - [Enums and compile-time-constant-integral parameter axes](examples/enums.cu)
 - [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
 - [Skipping benchmark configurations](examples/skip.cu)
+- [Benchmarking on a specific stream](examples/stream.cu)
 - [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
 - [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu)
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a6adc80..4532bfe 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -4,6 +4,7 @@ set(example_srcs
   exec_tag_sync.cu
   exec_tag_timer.cu
   skip.cu
+  stream.cu
   throughput.cu
   auto_throughput.cu
 )
diff --git a/examples/stream.cu b/examples/stream.cu
new file mode 100644
index 0000000..d0ca0c8
--- /dev/null
+++ b/examples/stream.cu
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2022 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+// Grab some testing kernels from NVBench:
+#include <nvbench/test_kernels.cuh>
+
+// Thrust vectors simplify memory management:
+#include <thrust/device_vector.h>
+
+// A function to benchmark but does not expose an explicit stream argument.
+void copy(int32_t *input, int32_t *output, std::size_t const num_values)
+{
+  nvbench::copy_kernel<<<256, 256>>>(input, output, num_values);
+}
+
+// `stream_bench` copies a 64 MiB buffer of int32_t on a CUDA stream specified
+// by the user.
+//
+// By default, NVBench creates and provides an explicit stream via
+// `launch::get_stream()` to pass to every stream-ordered operation. Sometimes
+// it is inconvenient or impossible to specify an explicit CUDA stream to every
+// stream-ordered operation. In this case, users may specify a target stream via
+// `state::set_cuda_stream`. It is assumed that all work of interest executes on
+// or synchronizes with this stream.
+void stream_bench(nvbench::state &state)
+{
+  // Allocate input data:
+  const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
+  thrust::device_vector<nvbench::int32_t> input(num_values);
+  thrust::device_vector<nvbench::int32_t> output(num_values);
+
+  // Set CUDA default stream as the target stream. Note the default stream
+  // is non-owning.
+  cudaStream_t default_stream = 0;
+  state.set_cuda_stream(
+    nvbench::cuda_stream{default_stream, false /*owning = false*/});
+
+  state.exec([&input, &output, num_values](nvbench::launch &) {
+    copy(thrust::raw_pointer_cast(input.data()),
+         thrust::raw_pointer_cast(output.data()),
+         num_values);
+  });
+}
+
+NVBENCH_BENCH(stream_bench);

From da2ec38cdb0403933ee63136969cc2f5506ffd26 Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Fri, 11 Feb 2022 13:20:05 -0500
Subject: [PATCH 15/18] Exclude some bits from clang-format.

---
 nvbench/detail/measure_cupti.cu | 13 +++++++------
 nvbench/state.cuh               |  8 ++++++--
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu
index 75318ae..9e8de6c 100644
--- a/nvbench/detail/measure_cupti.cu
+++ b/nvbench/detail/measure_cupti.cu
@@ -169,13 +169,14 @@ std::vector<std::string> add_metrics(nvbench::state &state)
 } // namespace
 
 measure_cupti_base::measure_cupti_base(state &exec_state)
-try : m_state
-{
-  exec_state
-}
-, m_launch{m_state.get_cuda_stream()},
-  m_cupti{*m_state.get_device(), add_metrics(m_state)}
+// clang-format off
+// (formatter doesn't handle `try :` very well...)
+try
+  : m_state{exec_state}
+  , m_launch{m_state.get_cuda_stream()}
+  , m_cupti{*m_state.get_device(), add_metrics(m_state)}
 {}
+// clang-format on
 catch (const std::exception &ex)
 {
   if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 2369bba..afb9a45 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -269,9 +269,13 @@ struct state
 
   [[nodiscard]] bool is_cupti_required() const
   {
-    return is_l2_hit_rate_collected() || is_l1_hit_rate_collected() ||
+    // clang-format off
+    return is_l2_hit_rate_collected() ||
+           is_l1_hit_rate_collected() ||
            is_stores_efficiency_collected() ||
-           is_loads_efficiency_collected() || is_dram_throughput_collected();
+           is_loads_efficiency_collected() ||
+           is_dram_throughput_collected();
+    // clang-format on
   }
 
   summary &add_summary(std::string summary_tag);

From 8ae58981ca6e65a439a07a4086ea77f1d633a889 Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Fri, 11 Feb 2022 13:25:41 -0500
Subject: [PATCH 16/18] Add docs for launch and cuda_stream.

---
 nvbench/cuda_stream.cuh | 23 ++++++++++++++++++++++-
 nvbench/launch.cuh      | 18 ++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index 1c71372..f8f4a52 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -27,9 +27,19 @@
 namespace nvbench
 {
 
-// RAII wrapper for a cudaStream_t.
+/**
+ * Manages and provides access to a CUDA stream.
+ *
+ * May be owning or non-owning. If the stream is owned, it will be freed with
+ * `cudaStreamDestroy` when the `cuda_stream`'s lifetime ends. Non-owning
+ * `cuda_stream`s are sometimes referred to as views.
+ */
 struct cuda_stream
 {
+  /**
+   * Constructs a cuda_stream that owns a new stream, created with
+   * `cudaStreamCreate`.
+   */
   cuda_stream()
       : m_stream{[]() {
                    cudaStream_t s;
@@ -39,6 +49,12 @@ struct cuda_stream
                  stream_deleter{true}}
   {}
 
+  /**
+   * Constructs a `cuda_stream` from an explicit cudaStream_t.
+   *
+   * @param owning If true, `cudaStreamCreate(stream)` will be called from this
+   * `cuda_stream`'s destructor.
+   */
   cuda_stream(cudaStream_t stream, bool owning)
       : m_stream{stream, stream_deleter{owning}}
   {}
@@ -51,9 +67,14 @@ struct cuda_stream
   cuda_stream(cuda_stream &&)                 = default;
   cuda_stream &operator=(cuda_stream &&) = default;
 
+  /**
+   * @return The `cudaStream_t` managed by this `cuda_stream`.
+   * @{
+   */
   operator cudaStream_t() const { return m_stream.get(); }
 
   cudaStream_t get_stream() const { return m_stream.get(); }
+  /**@}*/
 
 private:
   struct stream_deleter
diff --git a/nvbench/launch.cuh b/nvbench/launch.cuh
index fae3f0e..4b973f3 100644
--- a/nvbench/launch.cuh
+++ b/nvbench/launch.cuh
@@ -23,6 +23,18 @@
 namespace nvbench
 {
 
+/**
+ * Configuration object used to communicate with a `KernelLauncher`.
+ *
+ * The `KernelLauncher` passed into `nvbench::state::exec` is required to
+ * accept an `nvbench::launch` argument:
+ *
+ * ```cpp
+ * state.exec([](nvbench::launch &launch) {
+ *   kernel<<<M, N, 0, launch.get_stream()>>>();
+ * }
+ * ```
+ */
 struct launch
 {
   explicit launch(const nvbench::cuda_stream &stream)
@@ -35,12 +47,18 @@ struct launch
   launch &operator=(const launch &) = delete;
   launch &operator=(launch &&) = default;
 
+  /**
+   * @return a CUDA stream that all kernels and other stream-ordered CUDA work
+   * must use. This stream can be changed by the `KernelGenerator` using the
+   * `nvbench::state::set_cuda_stream` method.
+   */
   __forceinline__ const nvbench::cuda_stream &get_stream() const
   {
     return m_stream;
   };
 
 private:
+  // The stream is owned by the `nvbench::state` associated with this launch.
   const nvbench::cuda_stream &m_stream;
 };
 

From 3b4138763736d3cf184c6239bd26966e51c8fbbc Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Fri, 11 Feb 2022 13:26:10 -0500
Subject: [PATCH 17/18] Add `nvbench::make_cuda_stream_view(cudaStream_t)`.

---
 examples/stream.cu      |  4 +---
 nvbench/cuda_stream.cuh | 12 ++++++++++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/examples/stream.cu b/examples/stream.cu
index d0ca0c8..9507558 100644
--- a/examples/stream.cu
+++ b/examples/stream.cu
@@ -49,8 +49,7 @@ void stream_bench(nvbench::state &state)
   // Set CUDA default stream as the target stream. Note the default stream
   // is non-owning.
   cudaStream_t default_stream = 0;
-  state.set_cuda_stream(
-    nvbench::cuda_stream{default_stream, false /*owning = false*/});
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
 
   state.exec([&input, &output, num_values](nvbench::launch &) {
     copy(thrust::raw_pointer_cast(input.data()),
@@ -58,5 +57,4 @@ void stream_bench(nvbench::state &state)
          num_values);
   });
 }
-
 NVBENCH_BENCH(stream_bench);
diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index f8f4a52..6674c27 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -33,6 +33,8 @@ namespace nvbench
  * May be owning or non-owning. If the stream is owned, it will be freed with
  * `cudaStreamDestroy` when the `cuda_stream`'s lifetime ends. Non-owning
  * `cuda_stream`s are sometimes referred to as views.
+ *
+ * @sa nvbench::make_cuda_stream_view
  */
 struct cuda_stream
 {
@@ -54,6 +56,8 @@ struct cuda_stream
    *
    * @param owning If true, `cudaStreamCreate(stream)` will be called from this
    * `cuda_stream`'s destructor.
+   *
+   * @sa nvbench::make_cuda_stream_view
    */
   cuda_stream(cudaStream_t stream, bool owning)
       : m_stream{stream, stream_deleter{owning}}
@@ -94,4 +98,12 @@ private:
   std::unique_ptr<cudaStream_t, stream_deleter> m_stream;
 };
 
+/**
+ * Creates a non-owning view of the specified `stream`.
+ */
+inline nvbench::cuda_stream make_cuda_stream_view(cudaStream_t stream)
+{
+  return cuda_stream{stream, false};
+}
+
 } // namespace nvbench

From 039d45572793503541d30a5c985824fd38e1b9aa Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Fri, 11 Feb 2022 13:29:06 -0500
Subject: [PATCH 18/18] Move documentation on streams to new subsection.

Also update to use `nvbench::make_cuda_stream_view`.
---
 docs/benchmarks.md | 49 ++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 351a922..840fc4e 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -11,25 +11,6 @@ void my_benchmark(nvbench::state& state) {
 NVBENCH_BENCH(my_benchmark);
 ```
 
-NVBench records the elapsed time of work on a CUDA stream for each iteration of a benchmark. 
-By default, NVBench creates and provides an explicit stream via `launch::get_stream()` 
-to pass to every stream-ordered operation. 
-
-Sometimes it is inconvenient or impossible to specify an explicit CUDA stream to every 
-stream-ordered operation. A `cudaStream_t` may be provided via `state::set_cuda_stream`. 
-It is assumed that all work of interest executes on or synchronizes with this stream. 
-```cpp
-void my_benchmark(nvbench::state& state) {
-  cudaStream_t default_stream = 0;
-  state.set_cuda_stream(nvbench::cuda_stream{default_stream, false});
-  state.exec([](nvbench::launch&) {
-    my_func(); // a host API invoking GPU kernels without taking an explicit stream
-    my_kernel<<<num_blocks, 256>>>(); // or a kernel launched with the default stream
-  });
-}
-NVBENCH_BENCH(my_benchmark);
-```
-
 There are three main components in the definition of a benchmark:
 
 - A `KernelGenerator` callable (`my_benchmark` above)
@@ -60,6 +41,36 @@ attributes to be modified.
 NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<<num_blocks, 256>>>");
 ```
 
+# CUDA Streams
+
+NVBench records GPU execution times on a specific CUDA stream. By default, a new
+stream is created and passed to the `KernelLauncher` via the
+`nvbench::launch::get_stream()` method, as shown in
+[Minimal Benchmark](#minimal-benchmark). All benchmarked kernels and other
+stream-ordered work must be launched on this stream for NVBench to capture it.
+
+In some instances, it may be inconvenient or impossible to specify an explicit
+CUDA stream for the benchmarked operation to use. For example, a library may
+manage and use its own streams, or an opaque API may always launch work on the
+default stream. In these situations, users may provide NVBench with an explicit
+stream via `nvbench::state::set_cuda_stream` and `nvbench::make_stream_view`.
+It is assumed that all work of interest executes on or synchronizes with this
+stream.
+
+```cpp
+void my_benchmark(nvbench::state& state) {
+  cudaStream_t default_stream = 0;
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
+  state.exec([](nvbench::launch&) {
+    my_func(); // a host API invoking GPU kernels on the default stream
+    my_kernel<<<num_blocks, 256>>>(); // or a kernel launched with the default stream
+  });
+}
+NVBENCH_BENCH(my_benchmark);
+```
+
+A full example can be found in [examples/stream.cu](../examples/stream.cu).
+
 # Parameter Axes
 
 Some kernels will be used with a variety of options, input data types/sizes, and