NVIDIA · alliepiper · Feb 11, 2022 · Feb 4, 2022 · Feb 4, 2022 · Feb 4, 2022
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
@@ -11,6 +11,18 @@ void my_benchmark(nvbench::state& state) {
 NVBENCH_BENCH(my_benchmark);
 ```
 
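+To benchmark functions that do not expose a stream parameter, tell NVBench which stream they run on (here, the default stream) by passing a non-owning `nvbench::cuda_stream` to `state.set_cuda_stream`: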
+```cpp
+void my_benchmark(nvbench::state& state) {
+  state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false}); // false: non-owning, NVBench never destroys this stream
+  state.exec([](nvbench::launch&) {
+    my_func(); // a host API invoking GPU kernels without taking an explicit stream
+    my_kernel<<<num_blocks, 256>>>(); // or a kernel launched with the default stream
+  });
+}
+NVBENCH_BENCH(my_benchmark);
+```
+
 There are three main components in the definition of a benchmark:
 
 - A `KernelGenerator` callable (`my_benchmark` above)

diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
@@ -28,19 +28,75 @@ namespace nvbench
 // RAII wrapper for a cudaStream_t.
+// The wrapper either owns its stream (and destroys it) or merely borrows one
+// supplied by the caller (and leaves it alone); see the `owning` flag.
 struct cuda_stream
 {
-  cuda_stream() { NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream)); }
-  ~cuda_stream() { NVBENCH_CUDA_CALL(cudaStreamDestroy(m_stream)); }
+  // Creates a new stream with cudaStreamCreate; the wrapper owns it.
+  cuda_stream()
+      : m_owning{true}
+  {
+    NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream));
+  }
+
+  // Wraps an existing stream. Pass owning == false to borrow a stream
+  // (e.g. cudaStreamDefault) that this wrapper must never destroy.
+  cuda_stream(cudaStream_t stream, bool owning)
+      : m_stream{stream}
+      , m_owning{owning}
+  {}
+
+  // Destroys the stream if it is owned, then drops ownership so that a
+  // repeated destroy() or the destructor cannot destroy it a second time.
+  void destroy()
+  {
+    if (m_owning)
+    {
+      NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(m_stream));
+      m_owning = false;
+    }
+  }
+
+  ~cuda_stream() { destroy(); }
 
   // move-only
   cuda_stream(const cuda_stream &) = delete;
-  cuda_stream(cuda_stream &&)      = default;
   cuda_stream &operator=(const cuda_stream &) = delete;
-  cuda_stream &operator=(cuda_stream &&) = default;
+
+  // Takes over `other`'s stream handle and ownership flag. `other` is left
+  // non-owning, so its destructor will not destroy the moved stream.
+  cuda_stream(cuda_stream &&other) noexcept
+      : m_stream{other.m_stream}
+      , m_owning{other.m_owning}
+  {
+    other.m_owning = false;
+  }
+
+  // Releases the stream currently held (if owned) before taking over
+  // `other`'s; without this the previously owned stream would leak.
+  // Self-assignment is a no-op.
+  cuda_stream &operator=(cuda_stream &&other) noexcept
+  {
+    if (this != &other)
+    {
+      destroy();
+      m_stream       = other.m_stream;
+      m_owning       = other.m_owning;
+      other.m_owning = false;
+    }
+    return *this;
+  }
 
   operator cudaStream_t() const { return m_stream; }
 
+  // Returns the wrapped stream handle.
+  cudaStream_t get_stream() const { return m_stream; }
+
+  // Ownership flag: when true, destroy() and the destructor destroy the stream.
+  [[nodiscard]] bool get_owning() const { return m_owning; }
+  void set_owning(bool b) { m_owning = b; }
+
 private:
   cudaStream_t m_stream;
+  bool m_owning; // true when this wrapper is responsible for destroying m_stream
 };
 
 } // namespace nvbench
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
@@ -39,6 +39,7 @@ namespace nvbench::detail
 
 measure_cold_base::measure_cold_base(state &exec_state)
     : m_state{exec_state}
+    , m_launch{m_state.get_cuda_stream()}
     , m_run_once{exec_state.get_run_once()}
     , m_min_samples{exec_state.get_min_samples()}
     , m_max_noise{exec_state.get_max_noise()}

diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
@@ -169,7 +169,12 @@ std::vector<std::string> add_metrics(nvbench::state &state)
 } // namespace
 
 measure_cupti_base::measure_cupti_base(state &exec_state)
-try : m_state{exec_state}, m_cupti(*m_state.get_device(), add_metrics(m_state))
+try : m_state
+{
+  exec_state
+}
+, m_launch{m_state.get_cuda_stream()},
+  m_cupti{*m_state.get_device(), add_metrics(m_state)}
 {}
 catch (const std::exception &ex)
 {

diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
@@ -37,6 +37,7 @@ namespace nvbench::detail
 
 measure_hot_base::measure_hot_base(state &exec_state)
     : m_state{exec_state}
+    , m_launch{m_state.get_cuda_stream()}
     , m_min_samples{exec_state.get_min_samples()}
     , m_min_time{exec_state.get_min_time()}
     , m_skip_time{exec_state.get_skip_time()}

diff --git a/nvbench/launch.cuh b/nvbench/launch.cuh
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
@@ -25,8 +25,11 @@ namespace nvbench
 
 struct launch
 {
+  explicit launch(const nvbench::cuda_stream &stream)
+      : m_stream{stream} // held by reference: `stream` must outlive this launch
+  {}
+
   // move-only
-  launch()               = default;
   launch(const launch &) = delete;
   launch(launch &&)      = default;
   launch &operator=(const launch &) = delete;
@@ -38,7 +41,7 @@ struct launch
   };
 
 private:
-  nvbench::cuda_stream m_stream;
+  const nvbench::cuda_stream &m_stream;
 };
 
 } // namespace nvbench
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
@@ -18,6 +18,7 @@
 
 #pragma once
 
+#include <nvbench/cuda_stream.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/named_values.cuh>
@@ -62,6 +63,15 @@ struct state
   state &operator=(const state &) = delete;
   state &operator=(state &&) = default;
 
+  [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const
+  {
+    return m_cuda_stream; // stream the measure_* classes build their launch from
+  }
+  void set_cuda_stream(nvbench::cuda_stream &&stream)
+  {
+    m_cuda_stream = std::move(stream); // handle and owning flag move into the state
+  }
+
-  /// The CUDA device associated with with this benchmark state. May be
+  /// The CUDA device associated with this benchmark state. May be
   /// nullopt for CPU-only benchmarks.
   [[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
@@ -259,11 +269,9 @@ struct state
 
   [[nodiscard]] bool is_cupti_required() const
   {
-    return is_l2_hit_rate_collected()
-        || is_l1_hit_rate_collected()
-        || is_stores_efficiency_collected()
-        || is_loads_efficiency_collected()
-        || is_dram_throughput_collected();
+    return is_l2_hit_rate_collected() || is_l1_hit_rate_collected() ||
+           is_stores_efficiency_collected() ||
+           is_loads_efficiency_collected() || is_dram_throughput_collected();
   }
 
   summary &add_summary(std::string summary_tag);
@@ -303,6 +311,7 @@ private:
         std::optional<nvbench::device_info> device,
         std::size_t type_config_index);
 
+  nvbench::cuda_stream m_cuda_stream;
   std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
   nvbench::named_values m_axis_values;
   std::optional<nvbench::device_info> m_device;

diff --git a/testing/state.cu b/testing/state.cu
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 with the LLVM exception
  *  (the "License"); you may not use this file except in compliance with
@@ -51,6 +51,23 @@ struct state_tester : public nvbench::state
 
 using nvbench::detail::state_tester;
 
+void test_streams()
+{
+  dummy_bench bench;
+  state_tester state{bench};
+  // Non-owning: the state adopts the handle but must not take ownership.
+  state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false});
+  ASSERT(state.get_cuda_stream() == cudaStreamDefault);
+  ASSERT(!state.get_cuda_stream().get_owning());
+
+  // Owning: moving a stream into the state hands over handle and ownership.
+  auto stream = nvbench::cuda_stream{};
+  auto gold   = stream.get_stream();
+  state.set_cuda_stream(std::move(stream));
+  ASSERT(state.get_cuda_stream() == gold);
+  ASSERT(state.get_cuda_stream().get_owning());
+}
+
 void test_params()
 {
   dummy_bench bench;
@@ -110,6 +127,7 @@ void test_defaults()
 
 int main()
 {
+  test_streams();
   test_params();
   test_summaries();
   test_defaults();