Merge pull request #85 from PointKernel/static-multi-map
Add cuco::static_multimap
jrhemstad authored Nov 1, 2021
2 parents bd6c30b + f7d286b commit 62b90b7
Showing 25 changed files with 26,924 additions and 216 deletions.
38 changes: 33 additions & 5 deletions benchmarks/CMakeLists.txt
@@ -12,14 +12,18 @@ CPMAddPackage(
"RUN_HAVE_STD_REGEX 0" #
)

if (benchmark_ADDED)
# patch google benchmark target
set_target_properties(benchmark PROPERTIES CXX_STANDARD 14)
endif()
CPMAddPackage(
NAME nvbench
GITHUB_REPOSITORY NVIDIA/nvbench
GIT_TAG main
GIT_SHALLOW TRUE
)

###################################################################################################
# - compiler function -----------------------------------------------------------------------------
### compiler function #############################################################################
###################################################################################################

###################################################################################################
function(ConfigureBench BENCH_NAME BENCH_SRC)
add_executable(${BENCH_NAME} "${BENCH_SRC}")
set_target_properties(${BENCH_NAME} PROPERTIES
@@ -35,6 +39,22 @@ function(ConfigureBench BENCH_NAME BENCH_SRC)
CUDA::cudart)
endfunction(ConfigureBench)

###################################################################################################
function(ConfigureNVBench BENCH_NAME BENCH_SRC)
add_executable(${BENCH_NAME} "${BENCH_SRC}")
set_target_properties(${BENCH_NAME} PROPERTIES
POSITION_INDEPENDENT_CODE ON
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/nvbenchmarks")
target_include_directories(${BENCH_NAME} PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}")
#"${NVBench_SOURCE_DIR}")
target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr)
target_link_libraries(${BENCH_NAME} PRIVATE
nvbench::main
pthread
cuco)
endfunction(ConfigureNVBench)

###################################################################################################
### test sources ##################################################################################
###################################################################################################
@@ -47,6 +67,14 @@ ConfigureBench(DYNAMIC_MAP_BENCH "${DYNAMIC_MAP_BENCH_SRC}")
set(STATIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_map_bench.cu")
ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}")

###################################################################################################
set(STATIC_MULTIMAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_multimap/static_multimap_bench.cu")
ConfigureNVBench(STATIC_MULTIMAP_BENCH "${STATIC_MULTIMAP_BENCH_SRC}")

###################################################################################################
set(RETRIEVE_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_multimap/retrieve_bench.cu")
ConfigureNVBench(RETRIEVE_BENCH "${RETRIEVE_BENCH_SRC}")

###################################################################################################
set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu")
ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}")
596 changes: 596 additions & 0 deletions benchmarks/analysis/notebooks/StaticMultimap.ipynb

Large diffs are not rendered by default.

99 changes: 99 additions & 0 deletions benchmarks/analysis/notebooks/Utils.py
@@ -0,0 +1,99 @@
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Global parameters
colors = ['b','r','g','m','y','c']
styles = ['o','s','v','^','D',">"]

def plot_single_perf(bm, df, xaxis, unique_labels):
fig = plt.figure(1, figsize=(5, 5))
fig.suptitle(bm)

ax = fig.gca()
ax.set_xlabel(xaxis)
ax.set_ylabel('GPU Time (sec)')

ax.set_xscale('log')
ax.set_xticks(list(df[xaxis]))
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())

marker_handles = []

num_style = len(df["Distribution"].unique())

# Iterate over labels and label indices
for lindex, lbl in enumerate(unique_labels):
tmpdf = df.loc[df['Label'] == lbl]

x = tmpdf[xaxis]
perf = tmpdf["GPU Time (sec)"]

# Get style & type index
sid = lindex % num_style
tid = int(lindex / num_style)

if not tid:
ax.plot(x, perf, color=colors[sid])
ax.scatter(x, perf, color=colors[sid], marker=styles[sid])

# Add legend
marker_handles.append(ax.plot([], [], c=colors[sid], marker=styles[sid], \
label=lbl)[0])
else:
ax.plot(x, perf, color=colors[sid], linestyle="--")
ax.scatter(x, perf, color=colors[sid], marker=styles[sid], facecolors='none')

# Add legend
marker_handles.append(ax.plot([], [], c=colors[sid], marker=styles[sid], \
mfc='none', linestyle="--", label=lbl)[0])

leg = plt.legend(handles = marker_handles, loc="upper left", ncol=2, frameon=False)
plt.savefig(bm + '.eps')

def plot_dual_perf(bm, df, xaxis, unique_labels):
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle(bm)

marker_handles = []

lax = [ax1, ax2, ax3]

for item in lax:
item.set_xlabel(xaxis)
item.set_ylabel("GPU Time (sec)")

num_style = len(df["Distribution"].unique())

# Iterate over labels and label indices
for lindex, lbl in enumerate(unique_labels):
tmpdf = df.loc[df['Label'] == lbl]

x = tmpdf[xaxis]
perf = tmpdf["GPU Time (sec)"]

# Get style & type index
sid = lindex % num_style
tid = int(lindex / num_style)

# INT32
if not tid:
lax[sid].plot(x, perf, color=colors[sid])
lax[sid].scatter(x, perf, color=colors[sid], marker=styles[sid])

# Add legend
marker_handles.append(lax[sid].plot([], [], c=colors[sid], marker=styles[sid], \
label=lbl)[0])
# INT64
else:

lax[sid].plot(x, perf, color=colors[sid], linestyle="--")
lax[sid].scatter(x, perf, color=colors[sid], marker=styles[sid], facecolors='none')

# Add legend
marker_handles.append(lax[sid].plot([], [], c=colors[sid], marker=styles[sid], \
mfc='none', linestyle="--", label=lbl)[0])

leg = plt.legend(handles = marker_handles, loc="upper left", ncol=2, frameon=False)
plt.savefig(bm + '.eps')
123 changes: 123 additions & 0 deletions benchmarks/hash_table/static_multimap/retrieve_bench.cu
@@ -0,0 +1,123 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>
#include <random>

#include "cuco/static_multimap.cuh"

/**
 * @brief Generates input keys with a given number of repetitions per key.
*
*/
template <typename Key, typename OutputIt>
static void generate_multikeys(OutputIt output_begin,
OutputIt output_end,
size_t const multiplicity)
{
auto num_keys = std::distance(output_begin, output_end);

for (auto i = 0; i < num_keys; ++i) {
output_begin[i] = (i % (num_keys / multiplicity)) + 1;
}
}
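// Illustrative example (values chosen here for exposition only): with num_keys == 8 and
// multiplicity == 2, the loop above produces the keys 1, 2, 3, 4, 1, 2, 3, 4, i.e. each
// distinct key is repeated exactly `multiplicity` times.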

/**
 * @brief A benchmark evaluating multi-value retrieval performance by varying the number of
 * repetitions per key:
* - 100'000'000 keys are inserted
* - Map occupancy is fixed at 0.4
* - Number of repetitions per key: 1, ... , 128, 256
*
*/
template <typename Key, typename Value, nvbench::int32_t CGSize, nvbench::int32_t BufferSize>
std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_retrieve(
nvbench::state& state,
nvbench::type_list<Key, Value, nvbench::enum_type<CGSize>, nvbench::enum_type<BufferSize>>)
{
std::size_t const num_keys = state.get_int64("NumInputs");
auto const occupancy = state.get_float64("Occupancy");
std::size_t const size = num_keys / occupancy;
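// With the axis values configured below (NumInputs = 100'000'000, Occupancy = 0.4),
// this works out to a capacity of 250'000'000 slots.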
std::size_t const multiplicity = state.get_int64("Multiplicity");

state.add_element_count(num_keys, "NumKeys");
state.add_global_memory_writes<Key>(num_keys * 2);

std::vector<Key> h_keys(num_keys);
std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);

generate_multikeys<Key>(h_keys.begin(), h_keys.end(), multiplicity);
for (auto i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;
h_pairs[i].second = val;
}

thrust::device_vector<Key> d_keys(h_keys);
thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);

cuco::static_multimap<Key,
Value,
cuda::thread_scope_device,
cuco::cuda_allocator<char>,
cuco::double_hashing<CGSize,
cuco::detail::MurmurHash3_32<Key>,
cuco::detail::MurmurHash3_32<Key>>>
map{size, -1, -1};
map.insert(d_pairs.begin(), d_pairs.end());

auto const output_size = map.count_outer(d_keys.begin(), d_keys.end());
thrust::device_vector<cuco::pair_type<Key, Value>> d_results(output_size);

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream());
});
}

template <typename Key, typename Value, nvbench::int32_t CGSize, nvbench::int32_t BufferSize>
std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_retrieve(
nvbench::state& state,
nvbench::type_list<Key, Value, nvbench::enum_type<CGSize>, nvbench::enum_type<BufferSize>>)
{
state.skip("Key should be the same type as Value.");
}

using key_type = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
using value_type = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
using cg_size = nvbench::enum_type_list<1, 2, 4, 8, 16, 32>;
using buffer_size = nvbench::enum_type_list<1, 2, 4, 8, 16>;

NVBENCH_BENCH_TYPES(nvbench_retrieve,
NVBENCH_TYPE_AXES(key_type, value_type, cg_size, nvbench::enum_type_list<2>))
.set_type_axes_names({"Key", "Value", "CGSize", "BufferSize"})
.set_timeout(100) // Custom timeout: 100 s. Default is 15 s.
  .set_max_noise(3) // Custom noise: 3%. Default is 0.5%.
.add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000
.add_float64_axis("Occupancy", {0.4})
.add_int64_power_of_two_axis("Multiplicity", nvbench::range(0, 8, 1));

NVBENCH_BENCH_TYPES(
nvbench_retrieve,
NVBENCH_TYPE_AXES(key_type, value_type, nvbench::enum_type_list<8>, buffer_size))
.set_type_axes_names({"Key", "Value", "CGSize", "BufferSize"})
.set_timeout(100) // Custom timeout: 100 s. Default is 15 s.
  .set_max_noise(3) // Custom noise: 3%. Default is 0.5%.
.add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000
.add_float64_axis("Occupancy", {0.4})
.add_int64_power_of_two_axis("Multiplicity", nvbench::range(0, 8, 1));
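
Stripped of the nvbench scaffolding, the multimap usage pattern exercised by retrieve_bench.cu boils down to the minimal sketch below: construct a cuco::static_multimap, bulk-insert key/value pairs, size the output buffer with count_outer, then gather matches with retrieve_outer. The template configuration, the -1 sentinels, and the count/retrieve calls mirror the benchmark code above; the concrete sizes, the two-values-per-key layout, and the explicit null stream are illustrative assumptions, not something shipped in this PR. Splitting the query into a count pass and a retrieve pass is what lets the caller allocate the result buffer exactly before retrieval, which is also the shape of the benchmark's measured section.

#include "cuco/static_multimap.cuh"

#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <cstddef>
#include <cstdint>
#include <vector>

int main()
{
  using Key   = std::int32_t;
  using Value = std::int32_t;

  // Illustrative sizes: 1'000 pairs at ~0.4 occupancy, following the benchmark's sizing logic.
  std::size_t const num_pairs = 1'000;
  std::size_t const capacity  = num_pairs / 0.4;

  // Two values per key so that the query sees multiple matches per probe key.
  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_pairs);
  for (std::size_t i = 0; i < num_pairs; ++i) {
    h_pairs[i].first  = static_cast<Key>(i / 2);
    h_pairs[i].second = static_cast<Value>(i);
  }
  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);

  // Probe with the distinct keys 0 .. num_pairs/2 - 1.
  thrust::device_vector<Key> d_keys(num_pairs / 2);
  thrust::sequence(d_keys.begin(), d_keys.end());

  // Same template configuration as the benchmark: device scope, default allocator,
  // double hashing with MurmurHash3 and a cooperative-group size of 8.
  cuco::static_multimap<Key,
                        Value,
                        cuda::thread_scope_device,
                        cuco::cuda_allocator<char>,
                        cuco::double_hashing<8,
                                             cuco::detail::MurmurHash3_32<Key>,
                                             cuco::detail::MurmurHash3_32<Key>>>
    map{capacity, -1, -1};

  // Bulk-insert all pairs, then size the output with count_outer and gather the
  // matching pairs with retrieve_outer, as in the benchmark's measured section.
  map.insert(d_pairs.begin(), d_pairs.end());
  auto const output_size = map.count_outer(d_keys.begin(), d_keys.end());
  thrust::device_vector<cuco::pair_type<Key, Value>> d_results(output_size);
  map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), cudaStream_t{nullptr});

  return 0;
}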