New benchmark compares concurrent throughput of device_vector and device_uvector (#981)

Adds a new benchmark in `device_uvector_bench.cu` that measures the throughput of a workflow that interleaves vector creation with kernel launches across multiple concurrent streams. The benchmark is parameterized on the vector type:

1. `thrust::device_vector` -- allocates with `cudaMalloc`
2. `rmm::device_vector` -- allocates through RMM
3. `rmm::device_uvector` -- allocates through RMM and leaves memory uninitialized

The benchmark sets `cuda_async_memory_resource` as the current device resource, so the `rmm::` vector types allocate with `cudaMallocAsync`.
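
For reference, the resource setup boils down to the following minimal sketch, mirroring the `BM_VectorWorkflow` setup in the diff below (the `main` scaffolding here is illustrative only):

```cpp
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  // Stream-ordered allocator backed by cudaMallocAsync / cudaFreeAsync.
  rmm::mr::cuda_async_memory_resource cuda_async_mr{};

  // All subsequent rmm::device_vector / rmm::device_uvector allocations
  // on this device go through this resource.
  rmm::mr::set_current_device_resource(&cuda_async_mr);

  // ... run the vector workflows ...

  // Restore the default resource before cuda_async_mr goes out of scope.
  rmm::mr::set_current_device_resource(nullptr);
  return 0;
}
```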

Results on V100 show that option 1 is slowest due to allocation bottlenecks. Option 2 alleviates these by allocating with `cudaMallocFromPoolAsync`, but there is still no concurrency among the kernels because `thrust::device_vector` synchronizes the default stream. Option 3 is fastest and achieves full concurrency (verified in `nsight-sys`).
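
The core of the fast path is that `rmm::device_uvector` construction and destruction are stream-ordered and skip element initialization, so per-stream work never touches the default stream. A condensed sketch, adapted from the `vector_workflow` function in the diff below; the `launch_concurrent` helper and fixed-width kernel signature are illustrative assumptions, with launch parameters matching the benchmark (16 blocks of 256 threads) and `cuda_async_memory_resource` assumed current as above:

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>

#include <cstdint>
#include <vector>

// Grid-stride square kernel, as in the benchmark.
__global__ void kernel(std::int32_t const* input, std::int32_t* output, std::size_t num)
{
  for (auto i = blockDim.x * blockIdx.x + threadIdx.x; i < num; i += gridDim.x * blockDim.x) {
    output[i] = input[i] * input[i];
  }
}

void launch_concurrent(rmm::device_uvector<std::int32_t> const& input,
                       std::vector<rmm::cuda_stream> const& streams)
{
  for (rmm::cuda_stream_view stream : streams) {
    // Uninitialized, stream-ordered allocation: no cudaMemset and no
    // default-stream synchronization, so the streams' kernels can overlap.
    rmm::device_uvector<std::int32_t> output(input.size(), stream);
    kernel<<<16, 256, 0, stream.value()>>>(input.data(), output.data(), output.size());
    // With cuda_async_memory_resource, the destructor frees stream-ordered
    // on `stream`, after the kernel completes.
  }
  for (rmm::cuda_stream_view stream : streams) {
    stream.synchronize();
  }
}
```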

```
----------------------------------------------------------------------------------------------------------------------------------
Benchmark                                                                        Time             CPU   Iterations UserCounters...
----------------------------------------------------------------------------------------------------------------------------------
BM_VectorWorkflow<thrust::device_vector<int32_t>>/100000/manual_time           242 us          267 us         2962 bytes_per_second=13.8375G/s
BM_VectorWorkflow<thrust::device_vector<int32_t>>/1000000/manual_time         1441 us         1465 us          472 bytes_per_second=23.273G/s
BM_VectorWorkflow<thrust::device_vector<int32_t>>/10000000/manual_time       10483 us        10498 us           68 bytes_per_second=31.9829G/s
BM_VectorWorkflow<thrust::device_vector<int32_t>>/100000000/manual_time      63583 us        63567 us           12 bytes_per_second=52.7303G/s
BM_VectorWorkflow<rmm::device_vector<int32_t>>/100000/manual_time             82.0 us          105 us         8181 bytes_per_second=40.8661G/s
BM_VectorWorkflow<rmm::device_vector<int32_t>>/1000000/manual_time             502 us          527 us         1357 bytes_per_second=66.8029G/s
BM_VectorWorkflow<rmm::device_vector<int32_t>>/10000000/manual_time           4714 us         4746 us          148 bytes_per_second=71.1222G/s
BM_VectorWorkflow<rmm::device_vector<int32_t>>/100000000/manual_time         46451 us        46478 us           13 bytes_per_second=72.1784G/s
BM_VectorWorkflow<rmm::device_uvector<int32_t>>/100000/manual_time            39.0 us         59.9 us        17970 bytes_per_second=85.8733G/s
BM_VectorWorkflow<rmm::device_uvector<int32_t>>/1000000/manual_time            135 us          159 us         5253 bytes_per_second=248.987G/s
BM_VectorWorkflow<rmm::device_uvector<int32_t>>/10000000/manual_time          1319 us         1351 us          516 bytes_per_second=254.169G/s
BM_VectorWorkflow<rmm::device_uvector<int32_t>>/100000000/manual_time        12841 us        12865 us           54 bytes_per_second=261.099G/s
```

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Conor Hoekstra (https://github.com/codereport)

URL: #981
harrism authored Feb 17, 2022
1 parent f1e8a24 commit cf33a5a
Showing 1 changed file with 119 additions and 7 deletions.
benchmarks/device_uvector/device_uvector_bench.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,8 +14,14 @@
 * limitations under the License.
 */

#include "../synchronization/synchronization.hpp"
#include "thrust/detail/raw_pointer_cast.h"

#include <rmm/cuda_stream.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
@@ -24,18 +30,21 @@

#include <cuda_runtime_api.h>

-static void BM_UvectorSizeConstruction(benchmark::State& state)
+#include <cstdio>
+#include <type_traits>
+
+void BM_UvectorSizeConstruction(benchmark::State& state)
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
  rmm::mr::set_current_device_resource(&mr);

  for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
-    rmm::device_uvector<int32_t> vec(state.range(0), rmm::cuda_stream_view{});
+    rmm::device_uvector<std::int32_t> vec(state.range(0), rmm::cuda_stream_view{});
    cudaDeviceSynchronize();
  }

-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
+  state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

  rmm::mr::set_current_device_resource(nullptr);
}
@@ -45,18 +54,18 @@ BENCHMARK(BM_UvectorSizeConstruction)
->Range(10'000, 1'000'000'000) // NOLINT
->Unit(benchmark::kMicrosecond);

-static void BM_ThrustVectorSizeConstruction(benchmark::State& state)
+void BM_ThrustVectorSizeConstruction(benchmark::State& state)
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
  rmm::mr::set_current_device_resource(&mr);

  for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
-    rmm::device_vector<int32_t> vec(state.range(0));
+    rmm::device_vector<std::int32_t> vec(state.range(0));
    cudaDeviceSynchronize();
  }

-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
+  state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

  rmm::mr::set_current_device_resource(nullptr);
}
@@ -66,4 +75,107 @@ BENCHMARK(BM_ThrustVectorSizeConstruction)
->Range(10'000, 1'000'000'000) // NOLINT
->Unit(benchmark::kMicrosecond);

// Simple grid-stride kernel that squares each input element; used to test concurrent execution.
__global__ void kernel(int const* input, int* output, std::size_t num)
{
  for (auto i = blockDim.x * blockIdx.x + threadIdx.x; i < num; i += gridDim.x * blockDim.x) {
    output[i] = input[i] * input[i];
  }
}

using thrust_vector = thrust::device_vector<int32_t>;
using rmm_vector = rmm::device_vector<int32_t>;
using rmm_uvector = rmm::device_uvector<int32_t>;

// Construct a vector of the requested type. thrust::device_vector and rmm::device_vector
// zero-initialize their elements on construction; rmm::device_uvector leaves memory
// uninitialized unless zero_init is true, in which case it is zeroed asynchronously on `stream`.
template <typename Vector>
Vector make_vector(std::int64_t num_elements, rmm::cuda_stream_view stream, bool zero_init = false)
{
  static_assert(std::is_same_v<Vector, thrust_vector> or std::is_same_v<Vector, rmm_vector> or
                  std::is_same_v<Vector, rmm_uvector>,
                "unsupported vector type");
  if constexpr (std::is_same_v<Vector, thrust_vector>) {
    return Vector(num_elements, 0);
  } else if constexpr (std::is_same_v<Vector, rmm_vector>) {
    return Vector(num_elements, 0, rmm::mr::thrust_allocator<std::int32_t>(stream));
  } else if constexpr (std::is_same_v<Vector, rmm_uvector>) {
    auto vec = Vector(num_elements, stream);
    if (zero_init) {
      cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.value());
    }
    return vec;
  }
}

// Get a raw device pointer regardless of the vector type.
template <typename Vector>
int32_t* vector_data(Vector& vec)
{
  return thrust::raw_pointer_cast(vec.data());
}

template <typename Vector>
void vector_workflow(std::size_t num_elements,
                     std::int64_t num_blocks,
                     std::int64_t block_size,
                     rmm::cuda_stream const& input_stream,
                     std::vector<rmm::cuda_stream> const& streams)
{
  auto input = make_vector<Vector>(num_elements, input_stream, true);
  input_stream.synchronize();
  for (rmm::cuda_stream_view stream : streams) {
    auto output = make_vector<Vector>(num_elements, stream);
    kernel<<<num_blocks, block_size, 0, stream.value()>>>(
      vector_data(input), vector_data(output), num_elements);
  }

  for (rmm::cuda_stream_view stream : streams) {
    stream.synchronize();
  }
}

template <typename Vector>
void BM_VectorWorkflow(benchmark::State& state)
{
  rmm::mr::cuda_async_memory_resource cuda_async_mr{};
  rmm::mr::set_current_device_resource(&cuda_async_mr);

  rmm::cuda_stream input_stream;
  std::vector<rmm::cuda_stream> streams(4);

  auto const num_elements   = state.range(0);
  auto constexpr block_size = 256;
  auto constexpr num_blocks = 16;

  for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
    cuda_event_timer timer(state, true, input_stream);  // flush_l2_cache = true
    vector_workflow<Vector>(num_elements, num_blocks, block_size, input_stream, streams);
  }

  // Each element is accessed 9 times: one zero-initializing write of the input, then
  // for each of the 4 streams a read of the input and a write of the output.
  auto constexpr num_accesses = 9;
  auto const bytes            = num_elements * sizeof(std::int32_t) * num_accesses;
  state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations() * bytes));

  rmm::mr::set_current_device_resource(nullptr);
}

BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector)  // NOLINT
  ->RangeMultiplier(10)                               // NOLINT
  ->Range(100'000, 100'000'000)                       // NOLINT
  ->Unit(benchmark::kMicrosecond)
  ->UseManualTime();

// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()`
// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use
// `cuda_async_memory_resource`, which is faster.
BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector)  // NOLINT
  ->RangeMultiplier(10)                            // NOLINT
  ->Range(100'000, 100'000'000)                    // NOLINT
  ->Unit(benchmark::kMicrosecond)
  ->UseManualTime();

BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_uvector)  // NOLINT
  ->RangeMultiplier(10)                             // NOLINT
  ->Range(100'000, 100'000'000)                     // NOLINT
  ->Unit(benchmark::kMicrosecond)
  ->UseManualTime();

BENCHMARK_MAIN();
