Skip to content

Commit

Permalink
Merge pull request #76 from PointKernel/add-implicit-stream-support
Browse files Browse the repository at this point in the history
Add implicit stream benchmarking support
  • Loading branch information
alliepiper authored Feb 11, 2022
2 parents b1b6d73 + 039d455 commit 38cecd5
Show file tree
Hide file tree
Showing 11 changed files with 237 additions and 21 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ various NVBench features and usecases:
- [Enums and compile-time-constant-integral parameter axes](examples/enums.cu)
- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
- [Skipping benchmark configurations](examples/skip.cu)
- [Benchmarking on a specific stream](examples/stream.cu)
- [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
- [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu)
Expand Down
30 changes: 30 additions & 0 deletions docs/benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,36 @@ attributes to be modified.
NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<<num_blocks, 256>>>");
```

# CUDA Streams

NVBench records GPU execution times on a specific CUDA stream. By default, a new
stream is created and passed to the `KernelLauncher` via the
`nvbench::launch::get_stream()` method, as shown in
[Minimal Benchmark](#minimal-benchmark). All benchmarked kernels and other
stream-ordered work must be launched on this stream for NVBench to capture it.

In some instances, it may be inconvenient or impossible to specify an explicit
CUDA stream for the benchmarked operation to use. For example, a library may
manage and use its own streams, or an opaque API may always launch work on the
default stream. In these situations, users may provide NVBench with an explicit
stream via `nvbench::state::set_cuda_stream` and `nvbench::make_cuda_stream_view`.
It is assumed that all work of interest executes on or synchronizes with this
stream.

```cpp
void my_benchmark(nvbench::state& state) {
cudaStream_t default_stream = 0;
state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
state.exec([](nvbench::launch&) {
my_func(); // a host API invoking GPU kernels on the default stream
my_kernel<<<num_blocks, 256>>>(); // or a kernel launched with the default stream
});
}
NVBENCH_BENCH(my_benchmark);
```
A full example can be found in [examples/stream.cu](../examples/stream.cu).

# Parameter Axes
Some kernels will be used with a variety of options, input data types/sizes, and
Expand Down
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ set(example_srcs
exec_tag_sync.cu
exec_tag_timer.cu
skip.cu
stream.cu
throughput.cu
auto_throughput.cu
)
Expand Down
60 changes: 60 additions & 0 deletions examples/stream.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Copyright 2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <nvbench/nvbench.cuh>

// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>

// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>

// A function to benchmark, which does not itself expose a stream argument.
// The kernel below is therefore implicitly launched on CUDA's default stream
// (no fourth launch-config argument is given).
void copy(int32_t *input, int32_t *output, std::size_t const num_values)
{
  nvbench::copy_kernel<<<256, 256>>>(input, output, num_values);
}

// `stream_bench` measures copying a 64 MiB buffer of int32_t on a CUDA stream
// chosen by the benchmark author rather than by NVBench.
//
// NVBench normally creates its own stream and hands it to the KernelLauncher
// through `launch::get_stream()`. When the benchmarked code cannot accept a
// stream argument (opaque APIs, library-managed streams, work on the default
// stream), tell NVBench which stream to time via `state::set_cuda_stream`.
// All work of interest is assumed to execute on, or synchronize with, that
// stream.
void stream_bench(nvbench::state &state)
{
  // 64 MiB worth of int32 elements for the source/destination buffers:
  constexpr std::size_t buffer_bytes  = 64 * 1024 * 1024;
  const std::size_t element_count     = buffer_bytes / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> src(element_count);
  thrust::device_vector<nvbench::int32_t> dst(element_count);

  // Time the CUDA default stream. The view is non-owning, so NVBench will
  // not attempt to destroy the default stream.
  cudaStream_t legacy_default_stream = 0;
  state.set_cuda_stream(nvbench::make_cuda_stream_view(legacy_default_stream));

  state.exec([&src, &dst, element_count](nvbench::launch &) {
    copy(thrust::raw_pointer_cast(src.data()),
         thrust::raw_pointer_cast(dst.data()),
         element_count);
  });
}
NVBENCH_BENCH(stream_bench);
77 changes: 70 additions & 7 deletions nvbench/cuda_stream.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
Expand All @@ -22,25 +22,88 @@

#include <cuda_runtime_api.h>

#include <memory>

namespace nvbench
{

/**
 * Manages and provides access to a CUDA stream.
 *
 * May be owning or non-owning. If the stream is owned, it will be freed with
 * `cudaStreamDestroy` when the `cuda_stream`'s lifetime ends. Non-owning
 * `cuda_stream`s are sometimes referred to as views.
 *
 * @sa nvbench::make_cuda_stream_view
 */
struct cuda_stream
{
  /**
   * Constructs a cuda_stream that owns a new stream, created with
   * `cudaStreamCreate`.
   */
  cuda_stream()
      : m_stream{[]() {
                   cudaStream_t s;
                   NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
                   return s;
                 }(),
                 stream_deleter{true}}
  {}

  /**
   * Constructs a `cuda_stream` from an explicit cudaStream_t.
   *
   * @param owning If true, `cudaStreamDestroy(stream)` will be called from
   * this `cuda_stream`'s destructor.
   *
   * @sa nvbench::make_cuda_stream_view
   */
  cuda_stream(cudaStream_t stream, bool owning)
      : m_stream{stream, stream_deleter{owning}}
  {}

  ~cuda_stream() = default;

  // move-only
  cuda_stream(const cuda_stream &) = delete;
  cuda_stream(cuda_stream &&) = default;
  cuda_stream &operator=(const cuda_stream &) = delete;
  cuda_stream &operator=(cuda_stream &&) = default;

  /**
   * @return The `cudaStream_t` managed by this `cuda_stream`.
   * @{
   */
  operator cudaStream_t() const { return m_stream.get(); }

  cudaStream_t get_stream() const { return m_stream.get(); }
  /**@}*/

private:
  // The deleter's `pointer` typedef lets unique_ptr hold a cudaStream_t
  // handle directly; only owned streams are destroyed.
  struct stream_deleter
  {
    using pointer = cudaStream_t;
    bool owning;

    constexpr void operator()(pointer s) const noexcept
    {
      if (owning)
      {
        NVBENCH_CUDA_CALL_NOEXCEPT(cudaStreamDestroy(s));
      }
    }
  };

  std::unique_ptr<cudaStream_t, stream_deleter> m_stream;
};

/**
 * Creates a non-owning view of the specified `stream`.
 *
 * The returned `cuda_stream` never destroys `stream`; the caller retains
 * ownership of it.
 */
inline nvbench::cuda_stream make_cuda_stream_view(cudaStream_t stream)
{
  return {stream, false};
}

} // namespace nvbench
3 changes: 2 additions & 1 deletion nvbench/detail/measure_cold.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
Expand Down Expand Up @@ -39,6 +39,7 @@ namespace nvbench::detail

measure_cold_base::measure_cold_base(state &exec_state)
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_run_once{exec_state.get_run_once()}
, m_min_samples{exec_state.get_min_samples()}
, m_max_noise{exec_state.get_max_noise()}
Expand Down
10 changes: 8 additions & 2 deletions nvbench/detail/measure_cupti.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
Expand Down Expand Up @@ -169,8 +169,14 @@ std::vector<std::string> add_metrics(nvbench::state &state)
} // namespace

measure_cupti_base::measure_cupti_base(state &exec_state)
try : m_state{exec_state}, m_cupti(*m_state.get_device(), add_metrics(m_state))
// clang-format off
// (formatter doesn't handle `try :` very well...)
try
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_cupti{*m_state.get_device(), add_metrics(m_state)}
{}
// clang-format on
catch (const std::exception &ex)
{
if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
Expand Down
3 changes: 2 additions & 1 deletion nvbench/detail/measure_hot.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
Expand Down Expand Up @@ -37,6 +37,7 @@ namespace nvbench::detail

measure_hot_base::measure_hot_base(state &exec_state)
: m_state{exec_state}
, m_launch{m_state.get_cuda_stream()}
, m_min_samples{exec_state.get_min_samples()}
, m_min_time{exec_state.get_min_time()}
, m_skip_time{exec_state.get_skip_time()}
Expand Down
27 changes: 24 additions & 3 deletions nvbench/launch.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
Expand All @@ -23,22 +23,43 @@
namespace nvbench
{

/**
 * Configuration object used to communicate with a `KernelLauncher`.
 *
 * The `KernelLauncher` passed into `nvbench::state::exec` is required to
 * accept an `nvbench::launch` argument:
 *
 * ```cpp
 * state.exec([](nvbench::launch &launch) {
 *   kernel<<<M, N, 0, launch.get_stream()>>>();
 * });
 * ```
 */
struct launch
{
  explicit launch(const nvbench::cuda_stream &stream)
      : m_stream{stream}
  {}

  // move-only; not default-constructible, since a launch always refers to a
  // state-owned stream.
  launch(const launch &) = delete;
  launch(launch &&) = default;
  launch &operator=(const launch &) = delete;
  launch &operator=(launch &&) = default;

  /**
   * @return a CUDA stream that all kernels and other stream-ordered CUDA work
   * must use. This stream can be changed by the `KernelGenerator` using the
   * `nvbench::state::set_cuda_stream` method.
   */
  __forceinline__ const nvbench::cuda_stream &get_stream() const
  {
    return m_stream;
  }

private:
  // The stream is owned by the `nvbench::state` associated with this launch.
  const nvbench::cuda_stream &m_stream;
};

} // namespace nvbench
25 changes: 19 additions & 6 deletions nvbench/state.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2021 NVIDIA Corporation
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
Expand All @@ -18,6 +18,7 @@

#pragma once

#include <nvbench/cuda_stream.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/named_values.cuh>
Expand Down Expand Up @@ -62,6 +63,15 @@ struct state
state &operator=(const state &) = delete;
state &operator=(state &&) = default;

/// @return The CUDA stream on which this state's GPU work is timed.
[[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const
{
  return m_cuda_stream;
}
/// Replace the stream used for timing, e.g. with a non-owning view of an
/// existing stream obtained from `nvbench::make_cuda_stream_view`.
void set_cuda_stream(nvbench::cuda_stream &&stream)
{
  m_cuda_stream = std::move(stream);
}

/// The CUDA device associated with with this benchmark state. May be
/// nullopt for CPU-only benchmarks.
[[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
Expand Down Expand Up @@ -259,11 +269,13 @@ struct state

/// @return True if any of the collected metrics require CUPTI.
[[nodiscard]] bool is_cupti_required() const
{
  // clang-format off
  return is_l2_hit_rate_collected() ||
         is_l1_hit_rate_collected() ||
         is_stores_efficiency_collected() ||
         is_loads_efficiency_collected() ||
         is_dram_throughput_collected();
  // clang-format on
}

summary &add_summary(std::string summary_tag);
Expand Down Expand Up @@ -303,6 +315,7 @@ private:
std::optional<nvbench::device_info> device,
std::size_t type_config_index);

nvbench::cuda_stream m_cuda_stream;
std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
nvbench::named_values m_axis_values;
std::optional<nvbench::device_info> m_device;
Expand Down
Loading

0 comments on commit 38cecd5

Please sign in to comment.