Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Reduce compilation time #115

Closed
wants to merge 88 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
88 commits
Select commit Hold shift + click to select a range
8baac4a
Add more unit tests for custom key types
PointKernel Sep 25, 2021
02029bb
Minor cleanup: renaming variables
PointKernel Sep 25, 2021
143a5c5
Add custom type example
PointKernel Sep 26, 2021
1eed520
Get rid of the shared variable warning in single map tests
PointKernel Sep 26, 2021
262b201
Cleanups + use thrust logicals
PointKernel Sep 26, 2021
ec57180
Update example to build for pascal
PointKernel Sep 27, 2021
8d6521c
Add alignment for custom pair type
PointKernel Sep 27, 2021
5ac3ff2
Update test cmake files: license + cleanups
PointKernel Sep 28, 2021
762c74a
Add util header
PointKernel Sep 28, 2021
1b5440b
Include util header in tests
PointKernel Sep 28, 2021
d44f90d
Split tests into multiple files
PointKernel Sep 29, 2021
f2dd037
Correction: custom type example based on runtime capability check
PointKernel Oct 1, 2021
8900730
Revert back to use custom logical functions + runtime capability chec…
PointKernel Oct 1, 2021
2d7826d
Use BUILD_PASCAL_CODE macro in tests and example
PointKernel Oct 4, 2021
efeb2cb
Merge remote-tracking branch 'upstream/dev' into large-key-type
PointKernel Nov 1, 2021
bc2558c
Add plumbing for execution stream for static_map functions
chirayuG-nvidia Nov 1, 2021
b8c8e6c
Use transform iterator instead of materializing the inputs
PointKernel Nov 1, 2021
5be1e10
Fetch the latest libcudacxx + remove unnecessary alignment
PointKernel Nov 1, 2021
68227f6
Define and use CUCO_NO_INDEPENDENT_THREADS macro in sub-level CMake f…
PointKernel Nov 1, 2021
a057066
Use MATCH instead of IN_LIST in CMake
PointKernel Nov 1, 2021
bf6a90d
Use rapids_cpm_libcudacxx to support installed libcudacxx
robertmaynard Nov 2, 2021
1fee0e7
Make sure we specify libcudacxx include paths
robertmaynard Nov 2, 2021
5b05899
Merge remote-tracking branch 'robert/move_over_to_rapids_libcudacxx' …
PointKernel Nov 2, 2021
707349c
Merge branch 'large-key-type' into reduce-compilation-time
PointKernel Nov 4, 2021
062ca04
Add key_sentinel_test
PointKernel Nov 4, 2021
f9bbbf9
Split multimap tests into multiple files
PointKernel Nov 4, 2021
f0eecb2
Merge pull request #114 from robertmaynard/move_over_to_rapids_libcud…
jrhemstad Nov 8, 2021
d84dcee
Add unit test, address review comments
chirayuG-nvidia Nov 8, 2021
54ef33a
Exclude unused headers + use make_transform_iterator instead of trans…
PointKernel Nov 8, 2021
792e554
Make contains const and make necessary device view members accessible.
vyasr Nov 11, 2021
2373d71
Add tests for device-side contains.
vyasr Nov 11, 2021
7f6f1c2
Merge pull request #117 from vyasr/fix/static_map_device_contains
jrhemstad Nov 11, 2021
69f2ce6
Add static_map::insert_if.
vyasr Nov 11, 2021
e348ada
Add test of conditional insertion and fix bug.
vyasr Nov 11, 2021
f62ab46
Remove unnecessary pragma due to the use of new libcudacxx
PointKernel Nov 12, 2021
427c553
Renaming: custom type example instead of custom type
PointKernel Nov 12, 2021
5098d22
Merge remote-tracking branch 'upstream/dev' into large-key-type
PointKernel Nov 12, 2021
2142419
Resolve conflicts
PointKernel Nov 12, 2021
720d3bf
Allocate/deallocate take no stream argument
PointKernel Nov 15, 2021
719ffc5
Remove stream member variable
PointKernel Nov 15, 2021
8891da5
Change insert to insert_if_n.
vyasr Nov 17, 2021
c31ee58
Address most PR comments.
vyasr Nov 17, 2021
2bb6c01
Switch to using CG device API.
vyasr Nov 17, 2021
5da364f
Apply suggestions from code review
vyasr Nov 17, 2021
9405423
Apply clang-format.
vyasr Nov 17, 2021
8409283
Address review comments, actually commit unit test file
chirayuG-nvidia Nov 17, 2021
1af02fa
Remove unnecessary comment.
vyasr Nov 17, 2021
bb8c34d
Merge pull request #118 from vyasr/feature/static_map_insert_if
jrhemstad Nov 19, 2021
6433e8a
Merge pull request #119 from PointKernel/allocator-without-stream
jrhemstad Dec 2, 2021
ebcc3cf
Merge branch 'dev' into stream_param_static_map
jrhemstad Dec 2, 2021
58da1f4
Merge remote-tracking branch 'upstream/dev' into large-key-type
PointKernel Dec 3, 2021
ca68548
Resolve conflicts
PointKernel Dec 3, 2021
d4849f3
Minor CMake style fixes
PointKernel Dec 3, 2021
241f26f
Merge pull request #107 from PointKernel/large-key-type
PointKernel Dec 3, 2021
1e10b7d
Doc updates.
jrhemstad Dec 6, 2021
c8f042f
Add non-shared-memory pair retrieve APIs
PointKernel Dec 6, 2021
147634e
Add probing CG argument in non-shared-memory retrieve APIs
PointKernel Dec 6, 2021
6c44a88
Add non-shared-memory pair_retrieve kernel
PointKernel Dec 6, 2021
2d5d44b
Add non-shared-memory pair retrieve device impl functions
PointKernel Dec 7, 2021
8946c11
Fix typos
PointKernel Dec 7, 2021
071030a
Add public device functions
PointKernel Dec 7, 2021
5f24703
Use a workaround for output assignment
PointKernel Dec 7, 2021
76a6ba1
Merge pull request #113 from chirayuG-nvidia/stream_param_static_map
PointKernel Dec 7, 2021
6f041d0
Update doc
PointKernel Dec 7, 2021
777bfbc
Remove non-shmem pair retrieve kernel
PointKernel Dec 7, 2021
b787d85
Get rid of atomic counter argument for non-shmem pair retrieve
PointKernel Dec 7, 2021
3084f3e
Merge branch 'dev' into non-shared-memory-retrieve
PointKernel Dec 7, 2021
400e0e4
Use = operator for pair assignment
PointKernel Dec 7, 2021
0087120
Get rid of shared memory counter by using cg.ballot()
PointKernel Dec 8, 2021
50635ff
Force type conversion for iterator value_type
PointKernel Dec 8, 2021
600b8ab
Fix an output offset bug
PointKernel Dec 8, 2021
4da79c1
Rename utils hpp header
PointKernel Dec 9, 2021
53b0969
Add & use count_least_significant_bits
PointKernel Dec 9, 2021
c01d987
Add unit tests for non-shmem pair_retrieve
PointKernel Dec 9, 2021
34a77c1
Update docs
PointKernel Dec 9, 2021
8918f18
Code formatting
PointKernel Dec 9, 2021
25a37a6
Merge pull request #125 from PointKernel/code-formatting
PointKernel Dec 9, 2021
de12659
Non-shmem pair_retrieve take 4 output iterators
PointKernel Dec 11, 2021
1c836c7
Use const variable whenever possible in retrieve
PointKernel Dec 11, 2021
b9ece83
Make sentinel getters part of public device_view API.
vyasr Dec 11, 2021
57986f4
Merge pull request #126 from vyasr/feature/expose_sentinels_on_device
jrhemstad Dec 13, 2021
cf1a337
Remove leftovers
PointKernel Dec 14, 2021
809a5f4
Update docs
PointKernel Dec 14, 2021
327435e
Update docs
PointKernel Dec 14, 2021
8187626
Fix a bug: expose public device APIs
PointKernel Dec 14, 2021
d473fac
Merge pull request #127 from PointKernel/expose-device-public-apis
PointKernel Dec 15, 2021
193de1a
Merge pull request #122 from PointKernel/non-shared-memory-retrieve
vyasr Dec 16, 2021
8c7806d
Merge branch 'reduce-compilation-time' of github.com:PointKernel/cuCo…
PointKernel Jan 4, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ add_library(cuco::cuco ALIAS cuco)
target_include_directories(cuco INTERFACE
INTERFACE $<BUILD_INTERFACE:${CUCO_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
target_link_libraries(cuco INTERFACE libcudacxx CUDA::toolkit $<BUILD_INTERFACE:cuco::Thrust>)
target_link_libraries(cuco INTERFACE libcudacxx::libcudacxx CUDA::toolkit $<BUILD_INTERFACE:cuco::Thrust>)
target_compile_features(cuco INTERFACE cxx_std_17 cuda_std_17)

###################################################################################################
Expand Down Expand Up @@ -108,7 +108,7 @@ endif(BUILD_EXAMPLES)
###################################################################################################
# - Install targets -------------------------------------------------------------------------------

install(TARGETS cuco libcudacxx EXPORT cuco-exports)
install(TARGETS cuco EXPORT cuco-exports)
install(DIRECTORY include/cuco/ DESTINATION include/cuco)
install(FILES ${CUCO_BINARY_DIR}/include/cuco/version_config.hpp DESTINATION include/cuco)

Expand All @@ -131,7 +131,7 @@ structures tailored for efficient use with GPUs.
rapids_export(
INSTALL cuco
EXPORT_SET cuco-exports
GLOBAL_TARGETS cuco libcudacxx
GLOBAL_TARGETS cuco
NAMESPACE cuco::
DOCUMENTATION doc_string)

Expand All @@ -146,7 +146,7 @@ endif()
rapids_export(
BUILD cuco
EXPORT_SET cuco-exports
GLOBAL_TARGETS cuco libcudacxx
GLOBAL_TARGETS cuco
NAMESPACE cuco::
DOCUMENTATION doc_string
FINAL_CODE_BLOCK code_string)
100 changes: 49 additions & 51 deletions benchmarks/hash_table/dynamic_map_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,120 +15,118 @@
*/

#include <benchmark/benchmark.h>
#include <synchronization.hpp>
#include <cuco/dynamic_map.cuh>
#include <iostream>
#include <random>
#include <synchronization.hpp>

enum class dist_type {
UNIQUE,
UNIFORM,
GAUSSIAN
};
enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN };

template<dist_type Dist, typename Key, typename OutputIt>
static void generate_keys(OutputIt output_begin, OutputIt output_end) {
template <dist_type Dist, typename Key, typename OutputIt>
static void generate_keys(OutputIt output_begin, OutputIt output_end)
{
auto num_keys = std::distance(output_begin, output_end);

std::random_device rd;
std::mt19937 gen{rd()};

switch(Dist) {
switch (Dist) {
case dist_type::UNIQUE:
for(auto i = 0; i < num_keys; ++i) {
for (auto i = 0; i < num_keys; ++i) {
output_begin[i] = i;
}
break;
case dist_type::UNIFORM:
for(auto i = 0; i < num_keys; ++i) {
for (auto i = 0; i < num_keys; ++i) {
output_begin[i] = std::abs(static_cast<Key>(gen()));
}
break;
case dist_type::GAUSSIAN:
std::normal_distribution<> dg{1e9, 1e7};
for(auto i = 0; i < num_keys; ++i) {
for (auto i = 0; i < num_keys; ++i) {
output_begin[i] = std::abs(static_cast<Key>(dg(gen)));
}
break;
}
}

static void gen_final_size(benchmark::internal::Benchmark* b) {
for(auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) {
static void gen_final_size(benchmark::internal::Benchmark* b)
{
for (auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) {
b->Args({size});
}
}

template <typename Key, typename Value, dist_type Dist>
static void BM_dynamic_insert(::benchmark::State& state) {
static void BM_dynamic_insert(::benchmark::State& state)
{
using map_type = cuco::dynamic_map<Key, Value>;
std::size_t num_keys = state.range(0);
std::size_t initial_size = 1<<27;
std::vector<Key> h_keys( num_keys );
std::vector<cuco::pair_type<Key, Value>> h_pairs ( num_keys );

std::size_t num_keys = state.range(0);
std::size_t initial_size = 1 << 27;

std::vector<Key> h_keys(num_keys);
std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);

generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());

for(auto i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;
for (auto i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;
h_pairs[i].second = val;
}

thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs( h_pairs );
thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);

std::size_t batch_size = 1E6;
for(auto _ : state) {
for (auto _ : state) {
map_type map{initial_size, -1, -1};
{
cuda_event_timer raii{state};
for(auto i = 0; i < num_keys; i += batch_size) {
cuda_event_timer raii{state};
for (auto i = 0; i < num_keys; i += batch_size) {
map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
}
}
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) *
int64_t(state.iterations()) *
state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
int64_t(state.range(0)));
}

template <typename Key, typename Value, dist_type Dist>
static void BM_dynamic_search_all(::benchmark::State& state) {
static void BM_dynamic_search_all(::benchmark::State& state)
{
using map_type = cuco::dynamic_map<Key, Value>;

std::size_t num_keys = state.range(0);
std::size_t initial_size = 1<<27;

std::vector<Key> h_keys( num_keys );
std::vector<cuco::pair_type<Key, Value>> h_pairs ( num_keys );
std::size_t num_keys = state.range(0);
std::size_t initial_size = 1 << 27;

std::vector<Key> h_keys(num_keys);
std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);

generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
for(auto i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;

for (auto i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;
h_pairs[i].second = val;
}

thrust::device_vector<Key> d_keys( h_keys );
thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs( h_pairs );
thrust::device_vector<Value> d_results( num_keys );
thrust::device_vector<Key> d_keys(h_keys);
thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
thrust::device_vector<Value> d_results(num_keys);

map_type map{initial_size, -1, -1};
map.insert(d_pairs.begin(), d_pairs.end());

for(auto _ : state) {
for (auto _ : state) {
cuda_event_timer raii{state};
map.find(d_keys.begin(), d_keys.end(), d_results.begin());
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) *
int64_t(state.iterations()) *
state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
int64_t(state.range(0)));
}

Expand Down Expand Up @@ -161,7 +159,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
Expand Down
105 changes: 49 additions & 56 deletions benchmarks/hash_table/static_map_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,40 +15,37 @@
*/

#include <benchmark/benchmark.h>
#include "cuco/static_map.cuh"
#include <thrust/for_each.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/for_each.h>
#include <fstream>
#include <iostream>
#include <random>
#include "cuco/static_map.cuh"

enum class dist_type {
UNIQUE,
UNIFORM,
GAUSSIAN
};
enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN };

template<dist_type Dist, typename Key, typename OutputIt>
static void generate_keys(OutputIt output_begin, OutputIt output_end) {
template <dist_type Dist, typename Key, typename OutputIt>
static void generate_keys(OutputIt output_begin, OutputIt output_end)
{
auto num_keys = std::distance(output_begin, output_end);

std::random_device rd;
std::mt19937 gen{rd()};

switch(Dist) {
switch (Dist) {
case dist_type::UNIQUE:
for(auto i = 0; i < num_keys; ++i) {
for (auto i = 0; i < num_keys; ++i) {
output_begin[i] = i;
}
break;
case dist_type::UNIFORM:
for(auto i = 0; i < num_keys; ++i) {
for (auto i = 0; i < num_keys; ++i) {
output_begin[i] = std::abs(static_cast<Key>(gen()));
}
break;
case dist_type::GAUSSIAN:
std::normal_distribution<> dg{1e9, 1e7};
for(auto i = 0; i < num_keys; ++i) {
for (auto i = 0; i < num_keys; ++i) {
output_begin[i] = std::abs(static_cast<Key>(dg(gen)));
}
break;
Expand All @@ -59,39 +56,39 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) {
* @brief Generates input sizes and hash table occupancies
*
*/
static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) {
static void generate_size_and_occupancy(benchmark::internal::Benchmark* b)
{
for (auto size = 100'000'000; size <= 100'000'000; size *= 10) {
for (auto occupancy = 10; occupancy <= 90; occupancy += 10) {
b->Args({size, occupancy});
}
}
}



template <typename Key, typename Value, dist_type Dist>
static void BM_static_map_insert(::benchmark::State& state) {
static void BM_static_map_insert(::benchmark::State& state)
{
using map_type = cuco::static_map<Key, Value>;

std::size_t num_keys = state.range(0);
float occupancy = state.range(1) / float{100};
std::size_t size = num_keys / occupancy;
float occupancy = state.range(1) / float{100};
std::size_t size = num_keys / occupancy;

std::vector<Key> h_keys(num_keys);
std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);

std::vector<Key> h_keys( num_keys );
std::vector<cuco::pair_type<Key, Value>> h_pairs( num_keys );

generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
for(auto i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;

for (auto i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;
h_pairs[i].second = val;
}

thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs( h_pairs );
thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);

for(auto _ : state) {
for (auto _ : state) {
state.ResumeTiming();
state.PauseTiming();
map_type map{size, -1, -1};
Expand All @@ -102,54 +99,50 @@ static void BM_static_map_insert(::benchmark::State& state) {
state.PauseTiming();
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) *
int64_t(state.iterations()) *
state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
int64_t(state.range(0)));
}



template <typename Key, typename Value, dist_type Dist>
static void BM_static_map_search_all(::benchmark::State& state) {
static void BM_static_map_search_all(::benchmark::State& state)
{
using map_type = cuco::static_map<Key, Value>;

std::size_t num_keys = state.range(0);
float occupancy = state.range(1) / float{100};
std::size_t size = num_keys / occupancy;
float occupancy = state.range(1) / float{100};
std::size_t size = num_keys / occupancy;

map_type map{size, -1, -1};
auto view = map.get_device_mutable_view();

std::vector<Key> h_keys( num_keys );
std::vector<Value> h_values( num_keys );
std::vector<cuco::pair_type<Key, Value>> h_pairs ( num_keys );
std::vector<Value> h_results (num_keys);
std::vector<Key> h_keys(num_keys);
std::vector<Value> h_values(num_keys);
std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
std::vector<Value> h_results(num_keys);

generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
for(auto i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;

for (auto i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;
h_pairs[i].second = val;
}

thrust::device_vector<Key> d_keys( h_keys );
thrust::device_vector<Value> d_results( num_keys);
thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs( h_pairs );
thrust::device_vector<Key> d_keys(h_keys);
thrust::device_vector<Value> d_results(num_keys);
thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);

map.insert(d_pairs.begin(), d_pairs.end());
for(auto _ : state) {

for (auto _ : state) {
map.find(d_keys.begin(), d_keys.end(), d_results.begin());
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
int64_t(state.range(0)));
}



BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(generate_size_and_occupancy);
Expand Down
Loading