NVIDIA · jrhemstad · Nov 1, 2021 · Mar 5, 2021 · Mar 7, 2021 · Mar 7, 2021
@@ -12,14 +12,18 @@ CPMAddPackage(
     "RUN_HAVE_STD_REGEX 0" #
 )
 
-if (benchmark_ADDED)
-  # patch google benchmark target
-  set_target_properties(benchmark PROPERTIES CXX_STANDARD 14)
-endif()
+# Fetch the nvbench package from the NVIDIA/nvbench GitHub repository through CPM
+# (shallow clone).
+# NOTE(review): GIT_TAG follows the moving `main` branch, so the fetched revision can change
+# between configures — consider pinning a commit or release tag.
+CPMAddPackage(
+  NAME nvbench
+  GITHUB_REPOSITORY NVIDIA/nvbench
+  GIT_TAG main
+  GIT_SHALLOW TRUE
+)
 
 ###################################################################################################
-# - compiler function -----------------------------------------------------------------------------
+### compiler function #############################################################################
+###################################################################################################
 
+###################################################################################################
 function(ConfigureBench BENCH_NAME BENCH_SRC)
     add_executable(${BENCH_NAME} "${BENCH_SRC}")
     set_target_properties(${BENCH_NAME} PROPERTIES
@@ -35,6 +39,22 @@ function(ConfigureBench BENCH_NAME BENCH_SRC)
                                         CUDA::cudart)
 endfunction(ConfigureBench)
 
+###################################################################################################
+# ConfigureNVBench(<BENCH_NAME> <BENCH_SRC>)
+#
+# Adds an NVBench-based benchmark executable.
+#   BENCH_NAME - name of the executable target to create
+#   BENCH_SRC  - source file compiled into the target
+#
+# The binary is written to "${CMAKE_BINARY_DIR}/nvbenchmarks" and links against
+# nvbench::main, the thread library and cuco.
+#
+# Example:
+#   ConfigureNVBench(MY_BENCH "${CMAKE_CURRENT_SOURCE_DIR}/my_bench.cu")
+function(ConfigureNVBench BENCH_NAME BENCH_SRC)
+    # Resolve the thread library as an imported target instead of linking a bare `pthread`
+    # name; repeated calls are harmless because the target is only created once.
+    find_package(Threads REQUIRED)
+
+    add_executable(${BENCH_NAME} "${BENCH_SRC}")
+    set_target_properties(${BENCH_NAME} PROPERTIES
+                                        POSITION_INDEPENDENT_CODE ON
+                                        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/nvbenchmarks")
+    target_include_directories(${BENCH_NAME} PRIVATE
+                                             "${CMAKE_CURRENT_SOURCE_DIR}")
+    # These are nvcc flags; the generator expression keeps them away from any non-CUDA
+    # sources that may be added to the target.
+    target_compile_options(${BENCH_NAME} PRIVATE
+        "$<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda;--expt-relaxed-constexpr>")
+    target_link_libraries(${BENCH_NAME} PRIVATE
+                                        nvbench::main
+                                        Threads::Threads
+                                        cuco)
+endfunction(ConfigureNVBench)
+
 ###################################################################################################
 ### test sources ##################################################################################
 ###################################################################################################
@@ -47,6 +67,14 @@ ConfigureBench(DYNAMIC_MAP_BENCH "${DYNAMIC_MAP_BENCH_SRC}")
 set(STATIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_map_bench.cu")
 ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}")
 
+###################################################################################################
+# static_multimap benchmark, registered through ConfigureNVBench (NVBench-based) rather than
+# ConfigureBench.
+set(STATIC_MULTIMAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_multimap/static_multimap_bench.cu")
+ConfigureNVBench(STATIC_MULTIMAP_BENCH "${STATIC_MULTIMAP_BENCH_SRC}")
+
+###################################################################################################
+# static_multimap find_all benchmark, registered through ConfigureNVBench (NVBench-based).
+set(FIND_ALL_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_multimap/find_all_bench.cu")
+ConfigureNVBench(FIND_ALL_BENCH "${FIND_ALL_BENCH_SRC}")
+
 ###################################################################################################
 set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu")
 ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}")
@@ -0,0 +1,99 @@
+# Import libraries
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib
+
+# Global parameters
+# Color and marker tables shared by the plotting functions in this file; both hold six
+# entries and are indexed with the per-label style index.
+colors = ['b','r','g','m','y','c']
+styles = ['o','s','v','^','D',">"]
+
+def plot_single_perf(bm, df, xaxis, unique_labels):
+    fig = fig = plt.figure(1,figsize=(5, 5))
+    fig.suptitle(bm)
+
+    ax = fig.gca()
+    ax.set_xlabel(xaxis)
+    ax.set_ylabel('GPU Time (sec)')
+
+    ax.set_xscale('log')
+    ax.set_xticks(list(df[xaxis]))
+    ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
+
+    marker_handles = []
+
+    num_style = len(df["Distribution"].unique())
+
+    # Iterate over labels and label indices
+    for lindex, lbl in enumerate(unique_labels):
+        tmpdf = df.loc[df['Label'] == lbl]
+
+        x = tmpdf[xaxis]
+        perf = tmpdf["GPU Time (sec)"]
+
+        # Get style & type index
+        sid = lindex % num_style
+        tid = int(lindex / num_style)
+
+        if not tid:
+            ax.plot(x, perf, color=colors[sid])
+            ax.scatter(x, perf, color=colors[sid], marker=styles[sid])
+
+            # Add legend
+            marker_handles.append(ax.plot([], [], c=colors[sid], marker=styles[sid], \
+                                          label=lbl)[0])
+        else:
+            ax.plot(x, perf, color=colors[sid], linestyle="--")
+            ax.scatter(x, perf, color=colors[sid], marker=styles[sid], facecolors='none')
+
+            # Add legend
+            marker_handles.append(ax.plot([], [], c=colors[sid], marker=styles[sid], \
+                                          mfc='none', linestyle="--", label=lbl)[0])
+
+    leg = plt.legend(handles = marker_handles, loc="upper left", ncol=2, frameon=False)
+    plt.savefig(bm + '.eps')
+
+def plot_dual_perf(bm, df, xaxis, unique_labels):
+    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
+    fig.suptitle(bm)
+
+    marker_handles = []
+
+    lax = [ax1, ax2, ax3]
+
+    for item in lax:
+        item.set_xlabel(xaxis)
+        item.set_ylabel("GPU Time (sec)")
+
+    num_style = len(df["Distribution"].unique())
+
+    # Iterate over labels and label indices
+    for lindex, lbl in enumerate(unique_labels):
+        tmpdf = df.loc[df['Label'] == lbl]
+
+        x = tmpdf[xaxis]
+        perf = tmpdf["GPU Time (sec)"]
+
+        # Get style & type index
+        sid = lindex % num_style
+        tid = int(lindex / num_style)
+
+        # INT32
+        if not tid:
+            lax[sid].plot(x, perf, color=colors[sid])
+            lax[sid].scatter(x, perf, color=colors[sid], marker=styles[sid])
+
+            # Add legend
+            marker_handles.append(lax[sid].plot([], [], c=colors[sid], marker=styles[sid], \
+                                          label=lbl)[0])
+        # INT64
+        else:
+
+            lax[sid].plot(x, perf, color=colors[sid], linestyle="--")
+            lax[sid].scatter(x, perf, color=colors[sid], marker=styles[sid], facecolors='none')
+
+            # Add legend
+            marker_handles.append(lax[sid].plot([], [], c=colors[sid], marker=styles[sid], \
+                                          mfc='none', linestyle="--", label=lbl)[0])
+
+    leg = plt.legend(handles = marker_handles, loc="upper left", ncol=2, frameon=False)
+    plt.savefig(bm + '.eps')
diff --git a/benchmarks/hash_table/static_multimap/find_all_bench.cu b/benchmarks/hash_table/static_multimap/find_all_bench.cu
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+#include <random>
+
+#include "cuco/static_multimap.cuh"
+
+/**
+ * @brief Generates input keys by a given number of repetitions per key.
+ *
+ * Fills `[output_begin, output_end)` with the cyclic sequence `1, 2, ..., U, 1, 2, ...`
+ * where `U = num_keys / multiplicity` (integer division), so each key value occurs about
+ * `multiplicity` times.
+ *
+ * `U` is clamped to at least 1: a `multiplicity` of zero, or one larger than the number of
+ * keys, yields a range filled with the single key `1` instead of a division or modulo by
+ * zero.
+ *
+ * @tparam Key Key type the generated values are converted to
+ * @tparam OutputIt Random-access iterator to the output range
+ *
+ * @param output_begin Beginning of the range to fill
+ * @param output_end End of the range to fill
+ * @param multiplicity Requested number of repetitions per key
+ */
+template <typename Key, typename OutputIt>
+static void generate_multikeys(OutputIt output_begin,
+                               OutputIt output_end,
+                               size_t const multiplicity)
+{
+  auto const distance = std::distance(output_begin, output_end);
+  if (distance <= 0) { return; }
+
+  auto const num_keys = static_cast<size_t>(distance);
+
+  // Clamp both divisors so neither the division nor the modulo below can hit zero.
+  size_t const repetitions = (multiplicity == 0) ? 1 : multiplicity;
+  size_t const quotient    = num_keys / repetitions;
+  size_t const num_unique  = (quotient == 0) ? 1 : quotient;
+
+  for (size_t i = 0; i < num_keys; ++i) {
+    output_begin[i] = static_cast<Key>((i % num_unique) + 1);
+  }
+}
+
+/**
+ * @brief A benchmark evaluating multi-value retrieval performance by varying the number of
+ * repetitions per key:
+ * - 100'000'000 keys are inserted
+ * - Map occupancy is fixed at 0.4
+ * - Number of repetitions per key: 1, ... , 128, 256
+ *
+ * This overload is enabled only when `Key` and `Value` have the same size. The input size,
+ * occupancy and multiplicity are read from the "NumInputs", "Occupancy" and "Multiplicity"
+ * axes of `state`. Map construction, insertion and the sizing of the output buffer happen
+ * before the measured region; only `retrieve_outer` runs inside `state.exec`.
+ *
+ * @tparam Key Key type of the map
+ * @tparam Value Mapped type of the map
+ * @tparam CGSize Cooperative group size passed to `cuco::double_hashing`
+ * @tparam BufferSize Type-axis value; not referenced in this function body
+ *
+ * @param state NVBench state providing the axis values and running the measurement
+ */
+template <typename Key, typename Value, nvbench::int32_t CGSize, nvbench::int32_t BufferSize>
+std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_find_all(
+  nvbench::state& state,
+  nvbench::type_list<Key, Value, nvbench::enum_type<CGSize>, nvbench::enum_type<BufferSize>>)
+{
+  std::size_t const num_keys     = state.get_int64("NumInputs");
+  auto const occupancy           = state.get_float64("Occupancy");
+  std::size_t const size         = num_keys / occupancy;
+  std::size_t const multiplicity = state.get_int64("Multiplicity");
+
+  state.add_element_count(num_keys, "NumKeys");
+  state.add_global_memory_writes<Key>(num_keys * 2);
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+
+  generate_multikeys<Key>(h_keys.begin(), h_keys.end(), multiplicity);
+  // Each pair stores the key as its own payload. The index is std::size_t to match
+  // num_keys: an `int` index would overflow for inputs larger than INT_MAX.
+  for (std::size_t i = 0; i < num_keys; ++i) {
+    Key key           = h_keys[i];
+    Value val         = h_keys[i];
+    h_pairs[i].first  = key;
+    h_pairs[i].second = val;
+  }
+
+  thrust::device_vector<Key> d_keys(h_keys);
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+
+  // NOTE(review): the two -1 arguments are presumably the empty-key and empty-value
+  // sentinels -- confirm against the cuco::static_multimap constructor.
+  cuco::static_multimap<Key,
+                        Value,
+                        cuda::thread_scope_device,
+                        cuco::cuda_allocator<char>,
+                        cuco::double_hashing<CGSize,
+                                             cuco::detail::MurmurHash3_32<Key>,
+                                             cuco::detail::MurmurHash3_32<Key>>>
+    map{size, -1, -1};
+  map.insert(d_pairs.begin(), d_pairs.end());
+
+  // Size the result buffer up front so the measured region contains the retrieval only.
+  auto const output_size = map.count_outer(d_keys.begin(), d_keys.end());
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_results(output_size);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream());
+  });
+}
+
+/**
+ * @brief Fallback overload selected when `Key` and `Value` differ in size.
+ *
+ * Performs no measurement; it only calls `state.skip` with an explanatory message.
+ *
+ * @param state NVBench state of the configuration being skipped
+ */
+template <typename Key, typename Value, nvbench::int32_t CGSize, nvbench::int32_t BufferSize>
+std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_find_all(
+  nvbench::state& state,
+  nvbench::type_list<Key, Value, nvbench::enum_type<CGSize>, nvbench::enum_type<BufferSize>>)
+{
+  auto const* skip_reason = "Key should be the same type as Value.";
+  state.skip(skip_reason);
+}
+
+// Type axes shared by the benchmark registrations below.
+// key_type x value_type also produces mixed 32/64-bit combinations; those resolve to the
+// nvbench_find_all overload that calls state.skip.
+using key_type    = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
+using value_type  = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
+using cg_size     = nvbench::enum_type_list<1, 2, 4, 8, 16, 32>;
+using buffer_size = nvbench::enum_type_list<1, 2, 4, 8, 16>;
+
+// Registration 1: sweep the CGSize axis over cg_size with the BufferSize axis fixed to 2.
+// NOTE(review): the "Multiplicity" axis is built from nvbench::range(0, 8, 1) on a
+// power-of-two axis -- presumably exponents 0..8, i.e. the values 1..256 quoted in the
+// benchmark's doc comment; confirm against the NVBench documentation.
+NVBENCH_BENCH_TYPES(nvbench_find_all,
+                    NVBENCH_TYPE_AXES(key_type, value_type, cg_size, nvbench::enum_type_list<2>))
+  .set_type_axes_names({"Key", "Value", "CGSize", "BufferSize"})
+  .set_timeout(100)                            // Custom timeout: 100 s. Default is 15 s.
+  .set_max_noise(3)                            // Custom noise: 3%. By default: 0.5%.
+  .add_int64_axis("NumInputs", {100'000'000})  // Total number of key/value pairs: 100'000'000
+  .add_float64_axis("Occupancy", {0.4})
+  .add_int64_power_of_two_axis("Multiplicity", nvbench::range(0, 8, 1));
+
+// Registration 2: sweep the BufferSize axis over buffer_size with the CGSize axis fixed to 8.
+// NOTE(review): BufferSize is not referenced in the body of the enabled nvbench_find_all
+// overload in this file, so this sweep may not change what is measured -- confirm.
+NVBENCH_BENCH_TYPES(
+  nvbench_find_all,
+  NVBENCH_TYPE_AXES(key_type, value_type, nvbench::enum_type_list<8>, buffer_size))
+  .set_type_axes_names({"Key", "Value", "CGSize", "BufferSize"})
+  .set_timeout(100)                            // Custom timeout: 100 s. Default is 15 s.
+  .set_max_noise(3)                            // Custom noise: 3%. By default: 0.5%.
+  .add_int64_axis("NumInputs", {100'000'000})  // Total number of key/value pairs: 100'000'000
+  .add_float64_axis("Occupancy", {0.4})
+  .add_int64_power_of_two_axis("Multiplicity", nvbench::range(0, 8, 1));