From 15f2e92fdf3f20a3eeaf407d8bf7c572cd6f5807 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Feb 2022 13:26:00 -0500 Subject: [PATCH 01/18] Add owning and non-owning semantics to nvbench::cuda_stream --- nvbench/cuda_stream.cuh | 47 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh index 0efb43b..2ed1e5e 100644 --- a/nvbench/cuda_stream.cuh +++ b/nvbench/cuda_stream.cuh @@ -28,19 +28,58 @@ namespace nvbench // RAII wrapper for a cudaStream_t. struct cuda_stream { - cuda_stream() { NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream)); } - ~cuda_stream() { NVBENCH_CUDA_CALL(cudaStreamDestroy(m_stream)); } + cuda_stream() + : m_owning(true) + { + NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream)); + } + + cuda_stream(cuda_stream stream, bool owning) + : m_stream(stream) + , m_owning(owning) + {} + + // destroy the stream if it's owning + void destroy() + { + if (m_owning) + { + NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(m_stream)); + } + } + + ~cuda_stream() { destroy(); } // move-only cuda_stream(const cuda_stream &) = delete; - cuda_stream(cuda_stream &&) = default; cuda_stream &operator=(const cuda_stream &) = delete; - cuda_stream &operator=(cuda_stream &&) = default; + + cuda_stream(cuda_stream &&other) + : m_stream(other.get_stream()) + , m_owning(other.is_owning()) + { + other.destroy(); + } + + cuda_stream &operator=(cuda_stream &&other) + { + m_stream = other.get_stream(); + m_owning = other.is_owning(); + + other.destroy(); + + return *this; + } operator cudaStream_t() const { return m_stream; } + cudaStream_t get_stream() const { return m_stream; } + + bool is_owning() const { return m_owning; } + private: cudaStream_t m_stream; + bool m_owning; }; } // namespace nvbench From 8aea3e467ec0c73dd2c52ac0a961bc01b2094674 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Feb 2022 13:51:30 -0500 Subject: [PATCH 02/18] Add a cuda stream member to nvbench::state --- nvbench/cuda_stream.cuh | 10 +++++----- nvbench/state.cuh | 19 ++++++++++++++----- nvbench/state.cxx | 6 ++++-- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh index 2ed1e5e..3cd2fb4 100644 --- a/nvbench/cuda_stream.cuh +++ b/nvbench/cuda_stream.cuh @@ -29,14 +29,14 @@ namespace nvbench struct cuda_stream { cuda_stream() - : m_owning(true) + : m_owning{true} { NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream)); } cuda_stream(cuda_stream stream, bool owning) - : m_stream(stream) - , m_owning(owning) + : m_stream{stream} + , m_owning{owning} {} // destroy the stream if it's owning @@ -55,8 +55,8 @@ struct cuda_stream cuda_stream &operator=(const cuda_stream &) = delete; cuda_stream(cuda_stream &&other) - : m_stream(other.get_stream()) - , m_owning(other.is_owning()) + : m_stream{other.get_stream()} + , m_owning{other.is_owning()} { other.destroy(); } diff --git a/nvbench/state.cuh b/nvbench/state.cuh index 0b0b4cc..b244c0d 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -18,6 +18,7 @@ #pragma once +#include #include #include #include @@ -62,6 +63,15 @@ struct state state &operator=(const state &) = delete; state &operator=(state &&) = default; + [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const + { + return m_cuda_stream; + } + void set_cuda_stream(nvbench::cuda_stream &&stream) + { + m_cuda_stream = std::move(stream); + } + /// The CUDA device associated with with this benchmark state. May be /// nullopt for CPU-only benchmarks. [[nodiscard]] const std::optional &get_device() const @@ -259,11 +269,9 @@ struct state [[nodiscard]] bool is_cupti_required() const { - return is_l2_hit_rate_collected() - || is_l1_hit_rate_collected() - || is_stores_efficiency_collected() - || is_loads_efficiency_collected() - || is_dram_throughput_collected(); + return is_l2_hit_rate_collected() || is_l1_hit_rate_collected() || + is_stores_efficiency_collected() || + is_loads_efficiency_collected() || is_dram_throughput_collected(); } summary &add_summary(std::string summary_tag); @@ -303,6 +311,7 @@ private: std::optional device, std::size_t type_config_index); + nvbench::cuda_stream m_cuda_stream; std::reference_wrapper m_benchmark; nvbench::named_values m_axis_values; std::optional m_device; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index c7139ce..bdc4d8b 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -33,7 +33,8 @@ namespace nvbench { state::state(const benchmark_base &bench) - : m_benchmark{bench} + : m_cuda_stream{} + , m_benchmark{bench} , m_run_once{bench.get_run_once()} , m_min_samples{bench.get_min_samples()} , m_min_time{bench.get_min_time()} @@ -46,7 +47,8 @@ state::state(const benchmark_base &bench, nvbench::named_values values, std::optional device, std::size_t type_config_index) - : m_benchmark{bench} + : m_cuda_stream{} + , m_benchmark{bench} , m_axis_values{std::move(values)} , m_device{std::move(device)} , m_type_config_index{type_config_index} From c510a0e78c240be31adc87474e3ec59f56571602 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Feb 2022 13:56:02 -0500 Subject: [PATCH 03/18] Update launch to hold a const ref of nvbenc::cuda_stream --- nvbench/launch.cuh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nvbench/launch.cuh b/nvbench/launch.cuh index fa1bf95..5479462 100644 --- a/nvbench/launch.cuh +++ b/nvbench/launch.cuh @@ -25,8 +25,11 @@ namespace nvbench struct launch { + explicit launch(const nvbench::cuda_stream &stream) + : m_stream{stream} + {} + // move-only - launch() = default; launch(const launch &) = delete; launch(launch &&) = default; launch &operator=(const launch &) = delete; @@ -38,7 +41,7 @@ struct launch }; private: - nvbench::cuda_stream m_stream; + const nvbench::cuda_stream &m_stream; }; } // namespace nvbench From 14eab0774a53b82996305f5963a8da54320174d9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Feb 2022 14:16:43 -0500 Subject: [PATCH 04/18] Update measure_* classes to construct launch from the state cuda stream --- nvbench/detail/measure_cold.cu | 1 + nvbench/detail/measure_cupti.cu | 7 ++++++- nvbench/detail/measure_hot.cu | 1 + nvbench/state.cxx | 6 ++---- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index f8f5309..7014064 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -39,6 +39,7 @@ namespace nvbench::detail measure_cold_base::measure_cold_base(state &exec_state) : m_state{exec_state} + , m_launch{m_state.get_cuda_stream()} , m_run_once{exec_state.get_run_once()} , m_min_samples{exec_state.get_min_samples()} , m_max_noise{exec_state.get_max_noise()} diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu index b23a486..1a09f90 100644 --- a/nvbench/detail/measure_cupti.cu +++ b/nvbench/detail/measure_cupti.cu @@ -169,7 +169,12 @@ std::vector add_metrics(nvbench::state &state) } // namespace measure_cupti_base::measure_cupti_base(state &exec_state) -try : m_state{exec_state}, m_cupti(*m_state.get_device(), add_metrics(m_state)) +try : m_state +{ + exec_state +} +, m_launch{m_state.get_cuda_stream()}, + m_cupti{*m_state.get_device(), add_metrics(m_state)} {} catch (const std::exception &ex) { diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu index d036d2e..0d4100e 100644 --- a/nvbench/detail/measure_hot.cu +++ b/nvbench/detail/measure_hot.cu @@ -37,6 +37,7 @@ namespace nvbench::detail measure_hot_base::measure_hot_base(state &exec_state) : m_state{exec_state} + , m_launch{m_state.get_cuda_stream()} , m_min_samples{exec_state.get_min_samples()} , m_min_time{exec_state.get_min_time()} , m_skip_time{exec_state.get_skip_time()} diff --git a/nvbench/state.cxx b/nvbench/state.cxx index bdc4d8b..c7139ce 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -33,8 +33,7 @@ namespace nvbench { state::state(const benchmark_base &bench) - : m_cuda_stream{} - , m_benchmark{bench} + : m_benchmark{bench} , m_run_once{bench.get_run_once()} , m_min_samples{bench.get_min_samples()} , m_min_time{bench.get_min_time()} @@ -47,8 +46,7 @@ state::state(const benchmark_base &bench, nvbench::named_values values, std::optional device, std::size_t type_config_index) - : m_cuda_stream{} - , m_benchmark{bench} + : m_benchmark{bench} , m_axis_values{std::move(values)} , m_device{std::move(device)} , m_type_config_index{type_config_index} From 86708ec793fa582669f60fad4cd17d86b10533b0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Feb 2022 16:03:52 -0500 Subject: [PATCH 05/18] Fix a stream destroy bug --- nvbench/cuda_stream.cuh | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh index 3cd2fb4..d4c4b5e 100644 --- a/nvbench/cuda_stream.cuh +++ b/nvbench/cuda_stream.cuh @@ -34,7 +34,7 @@ struct cuda_stream NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream)); } - cuda_stream(cuda_stream stream, bool owning) + cuda_stream(cudaStream_t stream, bool owning) : m_stream{stream} , m_owning{owning} {} @@ -42,7 +42,7 @@ struct cuda_stream // destroy the stream if it's owning void destroy() { - if (m_owning) + if (m_owning and m_stream != cudaStreamDefault) { NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(m_stream)); } @@ -56,16 +56,24 @@ struct cuda_stream cuda_stream(cuda_stream &&other) : m_stream{other.get_stream()} - , m_owning{other.is_owning()} + , m_owning{other.get_owning()} { + if (m_owning) + { + other.set_owning(not m_owning); + } other.destroy(); } cuda_stream &operator=(cuda_stream &&other) { m_stream = other.get_stream(); - m_owning = other.is_owning(); + m_owning = other.get_owning(); + if (m_owning) + { + other.set_owning(not m_owning); + } other.destroy(); return *this; @@ -75,7 +83,8 @@ struct cuda_stream cudaStream_t get_stream() const { return m_stream; } - bool is_owning() const { return m_owning; } + [[nodiscard]] bool get_owning() const { return m_owning; } + void set_owning(bool b) { m_owning = b; } private: cudaStream_t m_stream; From 439ffec1c8287067f0309b7b455f08182f83077a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Feb 2022 16:35:55 -0500 Subject: [PATCH 06/18] Minor correction --- nvbench/cuda_stream.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh index d4c4b5e..0be0362 100644 --- a/nvbench/cuda_stream.cuh +++ b/nvbench/cuda_stream.cuh @@ -42,7 +42,7 @@ struct cuda_stream // destroy the stream if it's owning void destroy() { - if (m_owning and m_stream != cudaStreamDefault) + if (m_owning) { NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(m_stream)); } From 470beda9f047c5cad55f0a1cc96ec7c1127cc07b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Feb 2022 16:55:29 -0500 Subject: [PATCH 07/18] Add nvbench::state stream tests --- testing/state.cu | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/testing/state.cu b/testing/state.cu index 58e8dd1..ade63ed 100644 --- a/testing/state.cu +++ b/testing/state.cu @@ -51,6 +51,23 @@ struct state_tester : public nvbench::state using nvbench::detail::state_tester; +void test_streams() +{ + dummy_bench bench; + + state_tester state{bench}; + + // Test non-owning stream + state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false}); + ASSERT(state.get_cuda_stream() == cudaStreamDefault); + + // Test owning stream + auto stream = nvbench::cuda_stream{}; + auto gold = stream.get_stream(); + state.set_cuda_stream(std::move(stream)); + ASSERT(state.get_cuda_stream() == gold); +} + void test_params() { dummy_bench bench; @@ -110,6 +127,7 @@ void test_defaults() int main() { + test_streams(); test_params(); test_summaries(); test_defaults(); From 76cbbcc8f92ce0b8f8c01b5165d7ae456e262fe8 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Feb 2022 17:20:40 -0500 Subject: [PATCH 08/18] Update benchmarks.md --- docs/benchmarks.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 09820f9..5ec0c22 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -11,6 +11,18 @@ void my_benchmark(nvbench::state& state) { NVBENCH_BENCH(my_benchmark); ``` +The following example shows how to benchmark functions that do not expose stream parameters: +```cpp +void my_benchmark(nvbench::state& state) { + state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false}); + state.exec([](nvbench::launch&) { + my_func(); // a host API invoking GPU kernels without taking an explicit stream + my_kernel<<>>(); // or a kernel launched with the default stream + }); +} +NVBENCH_BENCH(my_benchmark); +``` + There are three main components in the definition of a benchmark: - A `KernelGenerator` callable (`my_benchmark` above) From 33a896f99eab7001efead4f23132978d56b43c69 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 4 Feb 2022 17:25:50 -0500 Subject: [PATCH 09/18] Update copyright year --- nvbench/cuda_stream.cuh | 2 +- nvbench/detail/measure_cold.cu | 2 +- nvbench/detail/measure_cupti.cu | 2 +- nvbench/detail/measure_hot.cu | 2 +- nvbench/launch.cuh | 2 +- nvbench/state.cuh | 2 +- testing/state.cu | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh index 0be0362..dfa9bb0 100644 --- a/nvbench/cuda_stream.cuh +++ b/nvbench/cuda_stream.cuh @@ -1,5 +1,5 @@ /* - * Copyright 2021 NVIDIA Corporation + * Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 with the LLVM exception * (the "License"); you may not use this file except in compliance with diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index 7014064..ec3bbbe 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -1,5 +1,5 @@ /* - * Copyright 2021 NVIDIA Corporation + * Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 with the LLVM exception * (the "License"); you may not use this file except in compliance with diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu index 1a09f90..75318ae 100644 --- a/nvbench/detail/measure_cupti.cu +++ b/nvbench/detail/measure_cupti.cu @@ -1,5 +1,5 @@ /* - * Copyright 2021 NVIDIA Corporation + * Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 with the LLVM exception * (the "License"); you may not use this file except in compliance with diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu index 0d4100e..25e2119 100644 --- a/nvbench/detail/measure_hot.cu +++ b/nvbench/detail/measure_hot.cu @@ -1,5 +1,5 @@ /* - * Copyright 2021 NVIDIA Corporation + * Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 with the LLVM exception * (the "License"); you may not use this file except in compliance with diff --git a/nvbench/launch.cuh b/nvbench/launch.cuh index 5479462..fae3f0e 100644 --- a/nvbench/launch.cuh +++ b/nvbench/launch.cuh @@ -1,5 +1,5 @@ /* - * Copyright 2021 NVIDIA Corporation + * Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 with the LLVM exception * (the "License"); you may not use this file except in compliance with diff --git a/nvbench/state.cuh b/nvbench/state.cuh index b244c0d..2369bba 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -1,5 +1,5 @@ /* - * Copyright 2021 NVIDIA Corporation + * Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 with the LLVM exception * (the "License"); you may not use this file except in compliance with diff --git a/testing/state.cu b/testing/state.cu index ade63ed..de87c81 100644 --- a/testing/state.cu +++ b/testing/state.cu @@ -1,5 +1,5 @@ /* - * Copyright 2021 NVIDIA Corporation + * Copyright 2021-2022 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 with the LLVM exception * (the "License"); you may not use this file except in compliance with From a2a12c689c3b15cec0f3bd5b05a25805b4103a0a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 6 Feb 2022 19:31:20 -0500 Subject: [PATCH 10/18] Update docs/benchmarks.md Co-authored-by: Jake Hemstad --- docs/benchmarks.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 5ec0c22..5a16aec 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -11,7 +11,13 @@ void my_benchmark(nvbench::state& state) { NVBENCH_BENCH(my_benchmark); ``` -The following example shows how to benchmark functions that do not expose stream parameters: +NVBench records the elapsed time of work on a CUDA stream for each iteration of a benchmark. +By default, NVBench creates and provides an explicit stream via `launch::get_stream()` +to pass to every stream-ordered operation. + +Sometimes it is inconvenient or impossible to specify an explicit CUDA stream to every +stream-ordered operation. A `cudaStream_t` may be provided via `state::set_cuda_stream`. +It is assumed that all work of interest executes on or synchronizes with this stream. ```cpp void my_benchmark(nvbench::state& state) { state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false}); From e7c29c1c1b47fde4b03febd2d56db166348c2f77 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 6 Feb 2022 19:34:57 -0500 Subject: [PATCH 11/18] Update docs --- docs/benchmarks.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 5a16aec..351a922 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -20,7 +20,8 @@ stream-ordered operation. A `cudaStream_t` may be provided via `state::set_cuda_ It is assumed that all work of interest executes on or synchronizes with this stream. ```cpp void my_benchmark(nvbench::state& state) { - state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false}); + cudaStream_t default_stream = 0; + state.set_cuda_stream(nvbench::cuda_stream{default_stream, false}); state.exec([](nvbench::launch&) { my_func(); // a host API invoking GPU kernels without taking an explicit stream my_kernel<<>>(); // or a kernel launched with the default stream From e05bf002f71356c14c6288381f5c44e891d20768 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 6 Feb 2022 20:14:41 -0500 Subject: [PATCH 12/18] Use unique_ptr + custom deleter to simplify destroy logic --- nvbench/cuda_stream.cuh | 72 ++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 45 deletions(-) diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh index dfa9bb0..1c71372 100644 --- a/nvbench/cuda_stream.cuh +++ b/nvbench/cuda_stream.cuh @@ -22,6 +22,8 @@ #include +#include + namespace nvbench { @@ -29,66 +31,46 @@ namespace nvbench struct cuda_stream { cuda_stream() - : m_owning{true} - { - NVBENCH_CUDA_CALL(cudaStreamCreate(&m_stream)); - } + : m_stream{[]() { + cudaStream_t s; + NVBENCH_CUDA_CALL(cudaStreamCreate(&s)); + return s; + }(), + stream_deleter{true}} + {} cuda_stream(cudaStream_t stream, bool owning) - : m_stream{stream} - , m_owning{owning} + : m_stream{stream, stream_deleter{owning}} {} - // destroy the stream if it's owning - void destroy() - { - if (m_owning) - { - NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(m_stream)); - } - } - - ~cuda_stream() { destroy(); } + ~cuda_stream() = default; // move-only cuda_stream(const cuda_stream &) = delete; cuda_stream &operator=(const cuda_stream &) = delete; + cuda_stream(cuda_stream &&) = default; + cuda_stream &operator=(cuda_stream &&) = default; - cuda_stream(cuda_stream &&other) - : m_stream{other.get_stream()} - , m_owning{other.get_owning()} - { - if (m_owning) - { - other.set_owning(not m_owning); - } - other.destroy(); - } + operator cudaStream_t() const { return m_stream.get(); } + + cudaStream_t get_stream() const { return m_stream.get(); } - cuda_stream &operator=(cuda_stream &&other) +private: + struct stream_deleter { - m_stream = other.get_stream(); - m_owning = other.get_owning(); + using pointer = cudaStream_t; + bool owning; - if (m_owning) + constexpr void operator()(pointer s) const noexcept { - other.set_owning(not m_owning); + if (owning) + { + NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(s)); + } } - other.destroy(); - - return *this; - } + }; - operator cudaStream_t() const { return m_stream; } - - cudaStream_t get_stream() const { return m_stream; } - - [[nodiscard]] bool get_owning() const { return m_owning; } - void set_owning(bool b) { m_owning = b; } - -private: - cudaStream_t m_stream; - bool m_owning; + std::unique_ptr m_stream; }; } // namespace nvbench From 6159d9c6cb5bff31105c77bb8241c6fc3450022b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 6 Feb 2022 20:19:21 -0500 Subject: [PATCH 13/18] Minor correction in unit test --- testing/state.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/testing/state.cu b/testing/state.cu index de87c81..a2300d1 100644 --- a/testing/state.cu +++ b/testing/state.cu @@ -58,8 +58,9 @@ void test_streams() state_tester state{bench}; // Test non-owning stream - state.set_cuda_stream(nvbench::cuda_stream{cudaStreamDefault, false}); - ASSERT(state.get_cuda_stream() == cudaStreamDefault); + cudaStream_t default_stream = 0; + state.set_cuda_stream(nvbench::cuda_stream{default_stream, false}); + ASSERT(state.get_cuda_stream() == default_stream); // Test owning stream auto stream = nvbench::cuda_stream{}; From fde2e408de13e014103a4f836725be8b8ecc6e57 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 7 Feb 2022 13:09:35 -0500 Subject: [PATCH 14/18] Add stream benchmark example --- README.md | 1 + examples/CMakeLists.txt | 1 + examples/stream.cu | 62 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 examples/stream.cu diff --git a/README.md b/README.md index a7240ec..28c5568 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ various NVBench features and usecases: - [Enums and compile-time-constant-integral parameter axes](examples/enums.cu) - [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu) - [Skipping benchmark configurations](examples/skip.cu) +- [Benchmarking on a specific stream](examples/stream.cu) - [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu) - [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a6adc80..4532bfe 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -4,6 +4,7 @@ set(example_srcs exec_tag_sync.cu exec_tag_timer.cu skip.cu + stream.cu throughput.cu auto_throughput.cu ) diff --git a/examples/stream.cu b/examples/stream.cu new file mode 100644 index 0000000..d0ca0c8 --- /dev/null +++ b/examples/stream.cu @@ -0,0 +1,62 @@ +/* + * Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +// Grab some testing kernels from NVBench: +#include + +// Thrust vectors simplify memory management: +#include + +// A function to benchmark but does not expose an explicit stream argument. +void copy(int32_t *input, int32_t *output, std::size_t const num_values) +{ + nvbench::copy_kernel<<<256, 256>>>(input, output, num_values); +} + +// `stream_bench` copies a 64 MiB buffer of int32_t on a CUDA stream specified +// by the user. +// +// By default, NVBench creates and provides an explicit stream via +// `launch::get_stream()` to pass to every stream-ordered operation. Sometimes +// it is inconvenient or impossible to specify an explicit CUDA stream to every +// stream-ordered operation. In this case, users may specify a target stream via +// `state::set_cuda_stream`. It is assumed that all work of interest executes on +// or synchronizes with this stream. +void stream_bench(nvbench::state &state) +{ + // Allocate input data: + const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t); + thrust::device_vector input(num_values); + thrust::device_vector output(num_values); + + // Set CUDA default stream as the target stream. Note the default stream + // is non-owning. + cudaStream_t default_stream = 0; + state.set_cuda_stream( + nvbench::cuda_stream{default_stream, false /*owning = false*/}); + + state.exec([&input, &output, num_values](nvbench::launch &) { + copy(thrust::raw_pointer_cast(input.data()), + thrust::raw_pointer_cast(output.data()), + num_values); + }); +} + +NVBENCH_BENCH(stream_bench); From da2ec38cdb0403933ee63136969cc2f5506ffd26 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 11 Feb 2022 13:20:05 -0500 Subject: [PATCH 15/18] Exclude some bits from clang-format. --- nvbench/detail/measure_cupti.cu | 13 +++++++------ nvbench/state.cuh | 8 ++++++-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu index 75318ae..9e8de6c 100644 --- a/nvbench/detail/measure_cupti.cu +++ b/nvbench/detail/measure_cupti.cu @@ -169,13 +169,14 @@ std::vector add_metrics(nvbench::state &state) } // namespace measure_cupti_base::measure_cupti_base(state &exec_state) -try : m_state -{ - exec_state -} -, m_launch{m_state.get_cuda_stream()}, - m_cupti{*m_state.get_device(), add_metrics(m_state)} +// clang-format off +// (formatter doesn't handle `try :` very well...) +try + : m_state{exec_state} + , m_launch{m_state.get_cuda_stream()} + , m_cupti{*m_state.get_device(), add_metrics(m_state)} {} +// clang-format on catch (const std::exception &ex) { if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); diff --git a/nvbench/state.cuh b/nvbench/state.cuh index 2369bba..afb9a45 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -269,9 +269,13 @@ struct state [[nodiscard]] bool is_cupti_required() const { - return is_l2_hit_rate_collected() || is_l1_hit_rate_collected() || + // clang-format off + return is_l2_hit_rate_collected() || + is_l1_hit_rate_collected() || is_stores_efficiency_collected() || - is_loads_efficiency_collected() || is_dram_throughput_collected(); + is_loads_efficiency_collected() || + is_dram_throughput_collected(); + // clang-format on } summary &add_summary(std::string summary_tag); From 8ae58981ca6e65a439a07a4086ea77f1d633a889 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 11 Feb 2022 13:25:41 -0500 Subject: [PATCH 16/18] Add docs for launch and cuda_stream. --- nvbench/cuda_stream.cuh | 23 ++++++++++++++++++++++- nvbench/launch.cuh | 18 ++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh index 1c71372..f8f4a52 100644 --- a/nvbench/cuda_stream.cuh +++ b/nvbench/cuda_stream.cuh @@ -27,9 +27,19 @@ namespace nvbench { -// RAII wrapper for a cudaStream_t. +/** + * Manages and provides access to a CUDA stream. + * + * May be owning or non-owning. If the stream is owned, it will be freed with + * `cudaStreamDestroy` when the `cuda_stream`'s lifetime ends. Non-owning + * `cuda_stream`s are sometimes referred to as views. + */ struct cuda_stream { + /** + * Constructs a cuda_stream that owns a new stream, created with + * `cudaStreamCreate`. + */ cuda_stream() : m_stream{[]() { cudaStream_t s; @@ -39,6 +49,12 @@ struct cuda_stream stream_deleter{true}} {} + /** + * Constructs a `cuda_stream` from an explicit cudaStream_t. + * + * @param owning If true, `cudaStreamCreate(stream)` will be called from this + * `cuda_stream`'s destructor. + */ cuda_stream(cudaStream_t stream, bool owning) : m_stream{stream, stream_deleter{owning}} {} @@ -51,9 +67,14 @@ struct cuda_stream cuda_stream(cuda_stream &&) = default; cuda_stream &operator=(cuda_stream &&) = default; + /** + * @return The `cudaStream_t` managed by this `cuda_stream`. + * @{ + */ operator cudaStream_t() const { return m_stream.get(); } cudaStream_t get_stream() const { return m_stream.get(); } + /**@}*/ private: struct stream_deleter diff --git a/nvbench/launch.cuh b/nvbench/launch.cuh index fae3f0e..4b973f3 100644 --- a/nvbench/launch.cuh +++ b/nvbench/launch.cuh @@ -23,6 +23,18 @@ namespace nvbench { +/** + * Configuration object used to communicate with a `KernelLauncher`. + * + * The `KernelLauncher` passed into `nvbench::state::exec` is required to + * accept an `nvbench::launch` argument: + * + * ```cpp + * state.exec([](nvbench::launch &launch) { + * kernel<<>>(); + * } + * ``` + */ struct launch { explicit launch(const nvbench::cuda_stream &stream) @@ -35,12 +47,18 @@ struct launch launch &operator=(const launch &) = delete; launch &operator=(launch &&) = default; + /** + * @return a CUDA stream that all kernels and other stream-ordered CUDA work + * must use. This stream can be changed by the `KernelGenerator` using the + * `nvbench::state::set_cuda_stream` method. + */ __forceinline__ const nvbench::cuda_stream &get_stream() const { return m_stream; }; private: + // The stream is owned by the `nvbench::state` associated with this launch. const nvbench::cuda_stream &m_stream; }; From 3b4138763736d3cf184c6239bd26966e51c8fbbc Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 11 Feb 2022 13:26:10 -0500 Subject: [PATCH 17/18] Add `nvbench::make_cuda_stream_view(cudaStream_t)`. --- examples/stream.cu | 4 +--- nvbench/cuda_stream.cuh | 12 ++++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/stream.cu b/examples/stream.cu index d0ca0c8..9507558 100644 --- a/examples/stream.cu +++ b/examples/stream.cu @@ -49,8 +49,7 @@ void stream_bench(nvbench::state &state) // Set CUDA default stream as the target stream. Note the default stream // is non-owning. cudaStream_t default_stream = 0; - state.set_cuda_stream( - nvbench::cuda_stream{default_stream, false /*owning = false*/}); + state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream)); state.exec([&input, &output, num_values](nvbench::launch &) { copy(thrust::raw_pointer_cast(input.data()), @@ -58,5 +57,4 @@ void stream_bench(nvbench::state &state) num_values); }); } - NVBENCH_BENCH(stream_bench); diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh index f8f4a52..6674c27 100644 --- a/nvbench/cuda_stream.cuh +++ b/nvbench/cuda_stream.cuh @@ -33,6 +33,8 @@ namespace nvbench * May be owning or non-owning. If the stream is owned, it will be freed with * `cudaStreamDestroy` when the `cuda_stream`'s lifetime ends. Non-owning * `cuda_stream`s are sometimes referred to as views. + * + * @sa nvbench::make_cuda_stream_view */ struct cuda_stream { @@ -54,6 +56,8 @@ struct cuda_stream * * @param owning If true, `cudaStreamCreate(stream)` will be called from this * `cuda_stream`'s destructor. + * + * @sa nvbench::make_cuda_stream_view */ cuda_stream(cudaStream_t stream, bool owning) : m_stream{stream, stream_deleter{owning}} @@ -94,4 +98,12 @@ private: std::unique_ptr m_stream; }; +/** + * Creates a non-owning view of the specified `stream`. + */ +inline nvbench::cuda_stream make_cuda_stream_view(cudaStream_t stream) +{ + return cuda_stream{stream, false}; +} + } // namespace nvbench From 039d45572793503541d30a5c985824fd38e1b9aa Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Fri, 11 Feb 2022 13:29:06 -0500 Subject: [PATCH 18/18] Move documentation on streams to new subsection. Also update to use `nvbench::make_cuda_stream_view`. --- docs/benchmarks.md | 49 ++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 351a922..840fc4e 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -11,25 +11,6 @@ void my_benchmark(nvbench::state& state) { NVBENCH_BENCH(my_benchmark); ``` -NVBench records the elapsed time of work on a CUDA stream for each iteration of a benchmark. -By default, NVBench creates and provides an explicit stream via `launch::get_stream()` -to pass to every stream-ordered operation. - -Sometimes it is inconvenient or impossible to specify an explicit CUDA stream to every -stream-ordered operation. A `cudaStream_t` may be provided via `state::set_cuda_stream`. -It is assumed that all work of interest executes on or synchronizes with this stream. -```cpp -void my_benchmark(nvbench::state& state) { - cudaStream_t default_stream = 0; - state.set_cuda_stream(nvbench::cuda_stream{default_stream, false}); - state.exec([](nvbench::launch&) { - my_func(); // a host API invoking GPU kernels without taking an explicit stream - my_kernel<<>>(); // or a kernel launched with the default stream - }); -} -NVBENCH_BENCH(my_benchmark); -``` - There are three main components in the definition of a benchmark: - A `KernelGenerator` callable (`my_benchmark` above) @@ -60,6 +41,36 @@ attributes to be modified. NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<>>"); ``` +# CUDA Streams + +NVBench records GPU execution times on a specific CUDA stream. By default, a new +stream is created and passed to the `KernelLauncher` via the +`nvbench::launch::get_stream()` method, as shown in +[Minimal Benchmark](#minimal-benchmark). All benchmarked kernels and other +stream-ordered work must be launched on this stream for NVBench to capture it. + +In some instances, it may be inconvenient or impossible to specify an explicit +CUDA stream for the benchmarked operation to use. For example, a library may +manage and use its own streams, or an opaque API may always launch work on the +default stream. In these situations, users may provide NVBench with an explicit +stream via `nvbench::state::set_cuda_stream` and `nvbench::make_stream_view`. +It is assumed that all work of interest executes on or synchronizes with this +stream. + +```cpp +void my_benchmark(nvbench::state& state) { + cudaStream_t default_stream = 0; + state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream)); + state.exec([](nvbench::launch&) { + my_func(); // a host API invoking GPU kernels on the default stream + my_kernel<<>>(); // or a kernel launched with the default stream + }); +} +NVBENCH_BENCH(my_benchmark); +``` + +A full example can be found in [examples/stream.cu](../examples/stream.cu). + # Parameter Axes Some kernels will be used with a variety of options, input data types/sizes, and