New benchmark compares concurrent throughput of device_vector and device_uvector (#981)

Adds a new benchmark in `device_uvector_bench.cu` that measures the throughput of a workflow that interleaves vector creation with kernel launches across multiple concurrent streams. The benchmark is parameterized on the vector type:

1. `thrust::device_vector` -- allocates with `cudaMalloc`
2. `rmm::device_vector` -- allocates through RMM
3. `rmm::device_uvector` -- allocates through RMM and leaves memory uninitialized

The benchmark sets `cuda_async_memory_resource` as the current device resource, so the `rmm::` vector types allocate with `cudaMallocAsync`.
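
For reference, the resource setup boils down to the following minimal sketch, mirroring the `BM_VectorWorkflow` setup in the diff below (the `main` scaffolding here is illustrative only):

```cpp
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  // Stream-ordered allocator backed by cudaMallocAsync / cudaFreeAsync.
  rmm::mr::cuda_async_memory_resource cuda_async_mr{};

  // All subsequent rmm::device_vector / rmm::device_uvector allocations
  // on this device go through this resource.
  rmm::mr::set_current_device_resource(&cuda_async_mr);

  // ... run the vector workflows ...

  // Restore the default resource before cuda_async_mr goes out of scope.
  rmm::mr::set_current_device_resource(nullptr);
  return 0;
}
```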

Results on V100 show that option 1 is slowest due to allocation bottlenecks. Option 2 alleviates these by allocating with `cudaMallocFromPoolAsync`, but there is still no concurrency among the kernels because `thrust::device_vector` synchronizes the default stream. Option 3 is fastest and achieves full concurrency (verified in `nsight-sys`).
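
The core of the fast path is that `rmm::device_uvector` construction and destruction are stream-ordered and skip element initialization, so per-stream work never touches the default stream. A condensed sketch, adapted from the `vector_workflow` function in the diff below; the `launch_concurrent` helper and fixed-width kernel signature are illustrative assumptions, with launch parameters matching the benchmark (16 blocks of 256 threads) and `cuda_async_memory_resource` assumed current as above:

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>

#include <cstdint>
#include <vector>

// Grid-stride square kernel, as in the benchmark.
__global__ void kernel(std::int32_t const* input, std::int32_t* output, std::size_t num)
{
  for (auto i = blockDim.x * blockIdx.x + threadIdx.x; i < num; i += gridDim.x * blockDim.x) {
    output[i] = input[i] * input[i];
  }
}

void launch_concurrent(rmm::device_uvector<std::int32_t> const& input,
                       std::vector<rmm::cuda_stream> const& streams)
{
  for (rmm::cuda_stream_view stream : streams) {
    // Uninitialized, stream-ordered allocation: no cudaMemset and no
    // default-stream synchronization, so the streams' kernels can overlap.
    rmm::device_uvector<std::int32_t> output(input.size(), stream);
    kernel<<<16, 256, 0, stream.value()>>>(input.data(), output.data(), output.size());
    // With cuda_async_memory_resource, the destructor frees stream-ordered
    // on `stream`, after the kernel completes.
  }
  for (rmm::cuda_stream_view stream : streams) {
    stream.synchronize();
  }
}
```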

```
----------------------------------------------------------------------------------------------------------------------------------
Benchmark                                                                        Time             CPU   Iterations UserCounters...
----------------------------------------------------------------------------------------------------------------------------------
BM_VectorWorkflow<thrust::device_vector<int32_t>>/100000/manual_time           242 us          267 us         2962 bytes_per_second=13.8375G/s
BM_VectorWorkflow<thrust::device_vector<int32_t>>/1000000/manual_time         1441 us         1465 us          472 bytes_per_second=23.273G/s
BM_VectorWorkflow<thrust::device_vector<int32_t>>/10000000/manual_time       10483 us        10498 us           68 bytes_per_second=31.9829G/s
BM_VectorWorkflow<thrust::device_vector<int32_t>>/100000000/manual_time      63583 us        63567 us           12 bytes_per_second=52.7303G/s
BM_VectorWorkflow<rmm::device_vector<int32_t>>/100000/manual_time             82.0 us          105 us         8181 bytes_per_second=40.8661G/s
BM_VectorWorkflow<rmm::device_vector<int32_t>>/1000000/manual_time             502 us          527 us         1357 bytes_per_second=66.8029G/s
BM_VectorWorkflow<rmm::device_vector<int32_t>>/10000000/manual_time           4714 us         4746 us          148 bytes_per_second=71.1222G/s
BM_VectorWorkflow<rmm::device_vector<int32_t>>/100000000/manual_time         46451 us        46478 us           13 bytes_per_second=72.1784G/s
BM_VectorWorkflow<rmm::device_uvector<int32_t>>/100000/manual_time            39.0 us         59.9 us        17970 bytes_per_second=85.8733G/s
BM_VectorWorkflow<rmm::device_uvector<int32_t>>/1000000/manual_time            135 us          159 us         5253 bytes_per_second=248.987G/s
BM_VectorWorkflow<rmm::device_uvector<int32_t>>/10000000/manual_time          1319 us         1351 us          516 bytes_per_second=254.169G/s
BM_VectorWorkflow<rmm::device_uvector<int32_t>>/100000000/manual_time        12841 us        12865 us           54 bytes_per_second=261.099G/s
```

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Conor Hoekstra (https://github.com/codereport)

URL: #981
harrism authored Feb 17, 2022
1 parent f1e8a24 commit cf33a5a
Showing 1 changed file with 119 additions and 7 deletions.
benchmarks/device_uvector/device_uvector_bench.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,8 +14,14 @@
 * limitations under the License.
 */

#include "../synchronization/synchronization.hpp"
#include "thrust/detail/raw_pointer_cast.h"

#include <rmm/cuda_stream.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
@@ -24,18 +30,21 @@

#include <cuda_runtime_api.h>

-static void BM_UvectorSizeConstruction(benchmark::State& state)
+#include <cstdio>
+#include <type_traits>
+
+void BM_UvectorSizeConstruction(benchmark::State& state)
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
  rmm::mr::set_current_device_resource(&mr);

  for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
-    rmm::device_uvector<int32_t> vec(state.range(0), rmm::cuda_stream_view{});
+    rmm::device_uvector<std::int32_t> vec(state.range(0), rmm::cuda_stream_view{});
    cudaDeviceSynchronize();
  }

-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
+  state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

  rmm::mr::set_current_device_resource(nullptr);
}
@@ -45,18 +54,18 @@ BENCHMARK(BM_UvectorSizeConstruction)
->Range(10'000, 1'000'000'000) // NOLINT
->Unit(benchmark::kMicrosecond);

-static void BM_ThrustVectorSizeConstruction(benchmark::State& state)
+void BM_ThrustVectorSizeConstruction(benchmark::State& state)
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
  rmm::mr::set_current_device_resource(&mr);

  for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
-    rmm::device_vector<int32_t> vec(state.range(0));
+    rmm::device_vector<std::int32_t> vec(state.range(0));
    cudaDeviceSynchronize();
  }

-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
+  state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

  rmm::mr::set_current_device_resource(nullptr);
}
@@ -66,4 +75,107 @@ BENCHMARK(BM_ThrustVectorSizeConstruction)
->Range(10'000, 1'000'000'000) // NOLINT
->Unit(benchmark::kMicrosecond);

// Simple grid-stride kernel that squares each input element; used to test concurrent execution.
__global__ void kernel(int const* input, int* output, std::size_t num)
{
  for (auto i = blockDim.x * blockIdx.x + threadIdx.x; i < num; i += gridDim.x * blockDim.x) {
    output[i] = input[i] * input[i];
  }
}

using thrust_vector = thrust::device_vector<int32_t>;
using rmm_vector = rmm::device_vector<int32_t>;
using rmm_uvector = rmm::device_uvector<int32_t>;

// Construct a vector of the requested type. thrust::device_vector and rmm::device_vector
// zero-initialize their elements on construction; rmm::device_uvector leaves memory
// uninitialized unless zero_init is true, in which case it is zeroed asynchronously on `stream`.
template <typename Vector>
Vector make_vector(std::int64_t num_elements, rmm::cuda_stream_view stream, bool zero_init = false)
{
  static_assert(std::is_same_v<Vector, thrust_vector> or std::is_same_v<Vector, rmm_vector> or
                  std::is_same_v<Vector, rmm_uvector>,
                "unsupported vector type");
  if constexpr (std::is_same_v<Vector, thrust_vector>) {
    return Vector(num_elements, 0);
  } else if constexpr (std::is_same_v<Vector, rmm_vector>) {
    return Vector(num_elements, 0, rmm::mr::thrust_allocator<std::int32_t>(stream));
  } else if constexpr (std::is_same_v<Vector, rmm_uvector>) {
    auto vec = Vector(num_elements, stream);
    if (zero_init) {
      cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.value());
    }
    return vec;
  }
}

// Get a raw device pointer regardless of the vector type.
template <typename Vector>
int32_t* vector_data(Vector& vec)
{
  return thrust::raw_pointer_cast(vec.data());
}

template <typename Vector>
void vector_workflow(std::size_t num_elements,
                     std::int64_t num_blocks,
                     std::int64_t block_size,
                     rmm::cuda_stream const& input_stream,
                     std::vector<rmm::cuda_stream> const& streams)
{
  auto input = make_vector<Vector>(num_elements, input_stream, true);
  input_stream.synchronize();
  for (rmm::cuda_stream_view stream : streams) {
    auto output = make_vector<Vector>(num_elements, stream);
    kernel<<<num_blocks, block_size, 0, stream.value()>>>(
      vector_data(input), vector_data(output), num_elements);
  }

  for (rmm::cuda_stream_view stream : streams) {
    stream.synchronize();
  }
}

template <typename Vector>
void BM_VectorWorkflow(benchmark::State& state)
{
  rmm::mr::cuda_async_memory_resource cuda_async_mr{};
  rmm::mr::set_current_device_resource(&cuda_async_mr);

  rmm::cuda_stream input_stream;
  std::vector<rmm::cuda_stream> streams(4);

  auto const num_elements   = state.range(0);
  auto constexpr block_size = 256;
  auto constexpr num_blocks = 16;

  for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
    cuda_event_timer timer(state, true, input_stream);  // flush_l2_cache = true
    vector_workflow<Vector>(num_elements, num_blocks, block_size, input_stream, streams);
  }

  // Each element is accessed 9 times: one zero-initializing write of the input, then
  // for each of the 4 streams a read of the input and a write of the output.
  auto constexpr num_accesses = 9;
  auto const bytes            = num_elements * sizeof(std::int32_t) * num_accesses;
  state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations() * bytes));

  rmm::mr::set_current_device_resource(nullptr);
}

BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector)  // NOLINT
  ->RangeMultiplier(10)                               // NOLINT
  ->Range(100'000, 100'000'000)                       // NOLINT
  ->Unit(benchmark::kMicrosecond)
  ->UseManualTime();

// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()`
// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use
// `cuda_async_memory_resource`, which is faster.
BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector)  // NOLINT
  ->RangeMultiplier(10)                            // NOLINT
  ->Range(100'000, 100'000'000)                    // NOLINT
  ->Unit(benchmark::kMicrosecond)
  ->UseManualTime();

BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_uvector)  // NOLINT
  ->RangeMultiplier(10)                             // NOLINT
  ->Range(100'000, 100'000'000)                     // NOLINT
  ->Unit(benchmark::kMicrosecond)
  ->UseManualTime();

BENCHMARK_MAIN();
