From 8baac4aa8315ff4373d3aa1e32c28886f3132cc1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 24 Sep 2021 23:21:22 -0400 Subject: [PATCH 01/70] Add more unit tests for custom key types --- tests/static_map/static_map_test.cu | 107 +++++++++++++++++++--------- 1 file changed, 72 insertions(+), 35 deletions(-) diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 779305996..0c8d64145 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -81,25 +81,63 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) // User-defined key type // Need to specify alignment to WAR libcu++ bug where cuda::atomic fails for underaligned types: // https://github.com/NVIDIA/libcudacxx/issues/160 -struct alignas(8) key_pair { - int32_t a; - int32_t b; +template +struct alignas(8) key_pair_type { + T a; + T b; + + __host__ __device__ key_pair_type() {} + __host__ __device__ key_pair_type(T x) : a{x}, b{x} {} + + __device__ bool operator==(key_pair_type const& other) const + { + return a == other.a and b == other.b; + } +}; + +template +struct large_key_type { + T a; + T b; + T c; + + __host__ __device__ large_key_type() {} + __host__ __device__ large_key_type(T x) : a{x}, b{x}, c{x} {} + + __device__ bool operator==(large_key_type const& other) const + { + return a == other.a and b == other.b and c == other.c; + } }; -struct hash_key_pair { - __device__ uint32_t operator()(key_pair k) { return k.a; }; +struct hash_custom_key { + template + __device__ uint32_t operator()(custom_type k) + { + return k.a; + }; }; -struct key_pair_equals { - __device__ bool operator()(key_pair lhs, key_pair rhs) +struct custom_key_equals { + template + __device__ bool operator()(custom_type lhs, custom_type rhs) { return std::tie(lhs.a, lhs.b) == std::tie(rhs.a, rhs.b); } }; -struct alignas(8) value_pair { - int32_t f; - int32_t s; +template +struct alignas(8) value_pair_type { + T f; + T s; + + __host__ 
__device__ value_pair_type() {} + __host__ __device__ value_pair_type(T x) : f{x}, s{x} {} + + __device__ bool operator==(value_pair_type const& other) const + { + return f == other.f and s == other.s; + } }; #define SIZE 10 @@ -110,17 +148,20 @@ struct custom_equals { __device__ bool operator()(T lhs, T rhs) { return A[lhs] == A[rhs]; } }; -TEST_CASE("User defined key and value type", "") +TEMPLATE_TEST_CASE_SIG("User defined key and value type", + "", + ((typename Key, typename Value), Key, Value), + (key_pair_type, value_pair_type), + (key_pair_type, value_pair_type), + (key_pair_type, value_pair_type), + (large_key_type, value_pair_type)) { - using Key = key_pair; - using Value = value_pair; - - auto constexpr sentinel_key = Key{-1, -1}; - auto constexpr sentinel_value = Value{-1, -1}; + auto const sentinel_key = Key{-1}; + auto const sentinel_value = Value{-1}; constexpr std::size_t num_pairs = 100; constexpr std::size_t capacity = num_pairs * 2; - cuco::static_map map{capacity, sentinel_key, sentinel_value}; + cuco::static_map map{capacity, sentinel_key, sentinel_value}; thrust::device_vector insert_keys(num_pairs); thrust::device_vector insert_values(num_pairs); @@ -129,17 +170,13 @@ TEST_CASE("User defined key and value type", "") thrust::counting_iterator(0), thrust::counting_iterator(num_pairs), insert_keys.begin(), - [] __device__(auto i) { - return Key{i, i}; - }); + [] __device__(auto i) { return Key{i}; }); thrust::transform(thrust::device, thrust::counting_iterator(0), thrust::counting_iterator(num_pairs), insert_values.begin(), - [] __device__(auto i) { - return Value{i, i}; - }); + [] __device__(auto i) { return Value{i}; }); auto insert_pairs = thrust::make_zip_iterator(thrust::make_tuple(insert_keys.begin(), insert_values.begin())); @@ -147,21 +184,21 @@ TEST_CASE("User defined key and value type", "") SECTION("All inserted keys-value pairs should be correctly recovered during find") { thrust::device_vector found_values(num_pairs); - 
map.insert(insert_pairs, insert_pairs + num_pairs, hash_key_pair{}, key_pair_equals{}); + map.insert(insert_pairs, insert_pairs + num_pairs, hash_custom_key{}, custom_key_equals{}); REQUIRE(num_pairs == map.get_size()); map.find(insert_keys.begin(), insert_keys.end(), found_values.begin(), - hash_key_pair{}, - key_pair_equals{}); + hash_custom_key{}, + custom_key_equals{}); REQUIRE(thrust::equal(thrust::device, insert_values.begin(), insert_values.end(), found_values.begin(), - [] __device__(value_pair lhs, value_pair rhs) { + [] __device__(Value lhs, Value rhs) { return std::tie(lhs.f, lhs.s) == std::tie(rhs.f, rhs.s); })); } @@ -169,12 +206,12 @@ TEST_CASE("User defined key and value type", "") SECTION("All inserted keys-value pairs should be contained") { thrust::device_vector contained(num_pairs); - map.insert(insert_pairs, insert_pairs + num_pairs, hash_key_pair{}, key_pair_equals{}); + map.insert(insert_pairs, insert_pairs + num_pairs, hash_custom_key{}, custom_key_equals{}); map.contains(insert_keys.begin(), insert_keys.end(), contained.begin(), - hash_key_pair{}, - key_pair_equals{}); + hash_custom_key{}, + custom_key_equals{}); REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); } @@ -184,8 +221,8 @@ TEST_CASE("User defined key and value type", "") map.contains(insert_keys.begin(), insert_keys.end(), contained.begin(), - hash_key_pair{}, - key_pair_equals{}); + hash_custom_key{}, + custom_key_equals{}); REQUIRE( none_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); } @@ -196,7 +233,7 @@ TEST_CASE("User defined key and value type", "") REQUIRE(all_of(insert_pairs, insert_pairs + num_pairs, [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair, hash_key_pair{}, key_pair_equals{}); + return m_view.insert(pair, hash_custom_key{}, custom_key_equals{}); })); } @@ -208,7 +245,7 @@ TEST_CASE("User defined key and value type", "") 
REQUIRE(all_of(insert_pairs, insert_pairs + num_pairs, [view] __device__(cuco::pair_type const& pair) mutable { - return view.find(pair.first, hash_key_pair{}, key_pair_equals{}) == + return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); })); } @@ -219,7 +256,7 @@ TEST_CASE("User defined key and value type", "") REQUIRE(all_of(insert_pairs, insert_pairs + num_pairs, [view] __device__(cuco::pair_type const& pair) { - return view.find(pair.first, hash_key_pair{}, key_pair_equals{}) == + return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); })); } From 02029bbb066a70b9b1f56f7375c4c739eee8b2f2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 24 Sep 2021 23:32:08 -0400 Subject: [PATCH 02/70] Minor cleanup: renaming variables --- tests/static_map/static_map_test.cu | 30 ++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 0c8d64145..b2fcf7e6a 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -159,22 +159,22 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", auto const sentinel_key = Key{-1}; auto const sentinel_value = Value{-1}; - constexpr std::size_t num_pairs = 100; - constexpr std::size_t capacity = num_pairs * 2; + constexpr std::size_t num = 100; + constexpr std::size_t capacity = num * 2; cuco::static_map map{capacity, sentinel_key, sentinel_value}; - thrust::device_vector insert_keys(num_pairs); - thrust::device_vector insert_values(num_pairs); + thrust::device_vector insert_keys(num); + thrust::device_vector insert_values(num); thrust::transform(thrust::device, thrust::counting_iterator(0), - thrust::counting_iterator(num_pairs), + thrust::counting_iterator(num), insert_keys.begin(), [] __device__(auto i) { return Key{i}; }); thrust::transform(thrust::device, thrust::counting_iterator(0), - 
thrust::counting_iterator(num_pairs), + thrust::counting_iterator(num), insert_values.begin(), [] __device__(auto i) { return Value{i}; }); @@ -183,10 +183,10 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", SECTION("All inserted keys-value pairs should be correctly recovered during find") { - thrust::device_vector found_values(num_pairs); - map.insert(insert_pairs, insert_pairs + num_pairs, hash_custom_key{}, custom_key_equals{}); + thrust::device_vector found_values(num); + map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{}); - REQUIRE(num_pairs == map.get_size()); + REQUIRE(num == map.get_size()); map.find(insert_keys.begin(), insert_keys.end(), @@ -205,8 +205,8 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", SECTION("All inserted keys-value pairs should be contained") { - thrust::device_vector contained(num_pairs); - map.insert(insert_pairs, insert_pairs + num_pairs, hash_custom_key{}, custom_key_equals{}); + thrust::device_vector contained(num); + map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{}); map.contains(insert_keys.begin(), insert_keys.end(), contained.begin(), @@ -217,7 +217,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", SECTION("Non-inserted keys-value pairs should not be contained") { - thrust::device_vector contained(num_pairs); + thrust::device_vector contained(num); map.contains(insert_keys.begin(), insert_keys.end(), contained.begin(), @@ -231,7 +231,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", { auto m_view = map.get_device_mutable_view(); REQUIRE(all_of(insert_pairs, - insert_pairs + num_pairs, + insert_pairs + num, [m_view] __device__(cuco::pair_type const& pair) mutable { return m_view.insert(pair, hash_custom_key{}, custom_key_equals{}); })); @@ -243,7 +243,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", { auto view = map.get_device_view(); REQUIRE(all_of(insert_pairs, - insert_pairs + num_pairs, + 
insert_pairs + num, [view] __device__(cuco::pair_type const& pair) mutable { return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); @@ -254,7 +254,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", { auto const view = map.get_device_view(); REQUIRE(all_of(insert_pairs, - insert_pairs + num_pairs, + insert_pairs + num, [view] __device__(cuco::pair_type const& pair) { return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); From 143a5c5d4b1bdc5a3f72debea6ea32178c564049 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 26 Sep 2021 15:40:21 -0400 Subject: [PATCH 03/70] Add custom type example --- examples/CMakeLists.txt | 1 + examples/static_map/custom_type.cu | 105 +++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 examples/static_map/custom_type.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f8598900b..88db15711 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -18,3 +18,4 @@ endfunction(ConfigureExample) ################################################################################################### ConfigureExample(STATIC_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_example.cu") +ConfigureExample(CUSTOM_TYPE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type.cu") diff --git a/examples/static_map/custom_type.cu b/examples/static_map/custom_type.cu new file mode 100644 index 000000000..bc7798e72 --- /dev/null +++ b/examples/static_map/custom_type.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +// User-defined key type +struct custom_key_type { + int32_t a; + int32_t b; + int32_t c; + + __host__ __device__ custom_key_type() {} + __host__ __device__ custom_key_type(int32_t x) : a{x}, b{x}, c{x} {} + + // Device equality operator is mandatory + __device__ bool operator==(custom_key_type const& other) const + { + return a == other.a and b == other.b and c == other.c; + } +}; + +// User-defined value type +// Manual alignment required due to WAR libcu++ bug where cuda::atomic fails for underaligned types +struct alignas(8) custom_value_type { + int32_t f; + int32_t s; + + __host__ __device__ custom_value_type() {} + __host__ __device__ custom_value_type(int32_t x) : f{x}, s{x} {} +}; + +// User-defined device hash callable +struct custom_hash { + __device__ uint32_t operator()(custom_key_type k) { return k.a; }; +}; + +// User-defined device key equal callable +struct custom_key_equals { + __device__ bool operator()(custom_key_type const& lhs, custom_key_type const& rhs) + { + return std::tie(lhs.a, lhs.b, lhs.c) == std::tie(rhs.a, rhs.b, rhs.c); + } +}; + +int main(void) +{ + constexpr std::size_t num_pairs = 80'000; + + // Set emtpy sentinels + auto const empty_key_sentinel = custom_key_type{-1}; + auto const empty_value_sentinel = custom_value_type{-1}; + + thrust::device_vector> pairs(num_pairs); + // Create a sequence of 80'000 pairs + thrust::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_pairs), + pairs.begin(), + [] __device__(auto i) { return 
thrust::make_pair(custom_key_type{i}, custom_value_type{i}); }); + + // Construct a map with 100,000 slots using the given empty key/value sentinels. Note the + // capacity is chosen knowing we will insert 80,000 keys, for an load factor of 80%. + cuco::static_map map{ + 100'000, empty_key_sentinel, empty_value_sentinel}; + + // Inserts all pairs into the map by using the custom hasher and custom equality callable + map.insert(pairs.begin(), pairs.end(), custom_hash{}, custom_key_equals{}); + + // Reproduce inserted keys + auto insert_keys = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return custom_key_type{i}; }); + + thrust::device_vector contained(num_pairs); + + // Determine if all the inserted keys can be found by using the same hasher and equality + // function as `insert`. If a key `insert_keys[i]` doesn't exist, `contained[i] == false`. + map.contains( + insert_keys, insert_keys + num_pairs, contained.begin(), custom_hash{}, custom_key_equals{}); + // This will fail due to inconsistent hash and key equal. + // map.contains(insert_keys, insert_keys + num_pairs, contained.begin()); + + // All inserted keys are contained + assert( + thrust::all_of(contained.begin(), contained.end(), [] __device__(auto const& b) { return b; })); + + return 0; +} From 1eed520e858fc81f02e9b59f17de0b2a432439cf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 26 Sep 2021 17:03:16 -0400 Subject: [PATCH 04/70] Get rid of the shared variable warning in single map tests --- tests/static_map/static_map_test.cu | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index b2fcf7e6a..6cdac6d3a 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,21 @@ * limitations under the License. */ +#include +#include + #include #include #include #include -#include + #include + #include -#include + +// Disable warning for `__shared__ cuda::atomic`: "dynamic initialization is not supported +// for a function-scope static __shared__ variable within a __device__/__global__ function" +#pragma diag_suppress static_var_with_dynamic_init namespace { namespace cg = cooperative_groups; From 262b20146701ac00cbe083b546e781737b0da50b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 26 Sep 2021 19:50:50 -0400 Subject: [PATCH 05/70] Cleanups + use thrust logicals --- tests/static_map/static_map_test.cu | 65 +++++++++++------------------ 1 file changed, 24 insertions(+), 41 deletions(-) diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 6cdac6d3a..d3f69271e 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -14,12 +14,11 @@ * limitations under the License. 
*/ -#include #include -#include #include #include +#include #include #include @@ -30,31 +29,8 @@ // for a function-scope static __shared__ variable within a __device__/__global__ function" #pragma diag_suppress static_var_with_dynamic_init -namespace { namespace cg = cooperative_groups; -// Thrust logical algorithms (any_of/all_of/none_of) don't work with device -// lambdas: See https://github.com/thrust/thrust/issues/1062 -template -bool all_of(Iterator begin, Iterator end, Predicate p) -{ - auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); -} - -template -bool any_of(Iterator begin, Iterator end, Predicate p) -{ - return thrust::count_if(begin, end, p) > 0; -} - -template -bool none_of(Iterator begin, Iterator end, Predicate p) -{ - return not all_of(begin, end, p); -} -} // namespace - enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; template @@ -102,6 +78,7 @@ struct alignas(8) key_pair_type { } }; +// User-defined key type template struct large_key_type { T a; @@ -117,6 +94,22 @@ struct large_key_type { } }; +// User-defined value type +template +struct alignas(8) value_pair_type { + T f; + T s; + + __host__ __device__ value_pair_type() {} + __host__ __device__ value_pair_type(T x) : f{x}, s{x} {} + + __device__ bool operator==(value_pair_type const& other) const + { + return f == other.f and s == other.s; + } +}; + +// User-defined device hasher struct hash_custom_key { template __device__ uint32_t operator()(custom_type k) @@ -125,6 +118,7 @@ struct hash_custom_key { }; }; +// User-defined device key equality struct custom_key_equals { template __device__ bool operator()(custom_type lhs, custom_type rhs) @@ -133,20 +127,6 @@ struct custom_key_equals { } }; -template -struct alignas(8) value_pair_type { - T f; - T s; - - __host__ __device__ value_pair_type() {} - __host__ __device__ value_pair_type(T x) : f{x}, s{x} {} - - __device__ bool operator==(value_pair_type const& other) const - { - return f == other.f and 
s == other.s; - } -}; - #define SIZE 10 __device__ int A[SIZE]; @@ -158,10 +138,13 @@ struct custom_equals { TEMPLATE_TEST_CASE_SIG("User defined key and value type", "", ((typename Key, typename Value), Key, Value), - (key_pair_type, value_pair_type), +// A key/value type larger than 8B is supported only for sm_70 and up +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) (key_pair_type, value_pair_type), (key_pair_type, value_pair_type), - (large_key_type, value_pair_type)) + (large_key_type, value_pair_type), +#endif + (key_pair_type, value_pair_type)) { auto const sentinel_key = Key{-1}; auto const sentinel_value = Value{-1}; From ec5718056fd60069fb70d97c17913e881b44f058 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 26 Sep 2021 20:39:08 -0400 Subject: [PATCH 06/70] Update example to build for pascal --- examples/static_map/custom_type.cu | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/examples/static_map/custom_type.cu b/examples/static_map/custom_type.cu index bc7798e72..9e3c918f9 100644 --- a/examples/static_map/custom_type.cu +++ b/examples/static_map/custom_type.cu @@ -21,6 +21,22 @@ #include // User-defined key type +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) +struct custom_key_type { + int32_t a; + int32_t b; + + __host__ __device__ custom_key_type() {} + __host__ __device__ custom_key_type(int32_t x) : a{x}, b{x} {} + + // Device equality operator is mandatory + __device__ bool operator==(custom_key_type const& other) const + { + return a == other.a and b == other.b; + } +}; +#else +// Key type larger than 8B only supported for sm_70 and up struct custom_key_type { int32_t a; int32_t b; @@ -35,6 +51,7 @@ struct custom_key_type { return a == other.a and b == other.b and c == other.c; } }; +#endif // User-defined value type // Manual alignment required due to WAR libcu++ bug where cuda::atomic fails for underaligned types @@ -55,7 +72,7 @@ struct custom_hash { struct custom_key_equals { __device__ 
bool operator()(custom_key_type const& lhs, custom_key_type const& rhs) { - return std::tie(lhs.a, lhs.b, lhs.c) == std::tie(rhs.a, rhs.b, rhs.c); + return lhs.a == rhs.a; } }; From 8d6521c56fc25093c6aaba7f73fbf4488df0ee54 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 26 Sep 2021 20:50:31 -0400 Subject: [PATCH 07/70] Add alignment for custom pair type --- examples/static_map/custom_type.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/static_map/custom_type.cu b/examples/static_map/custom_type.cu index 9e3c918f9..df4e8c647 100644 --- a/examples/static_map/custom_type.cu +++ b/examples/static_map/custom_type.cu @@ -22,7 +22,8 @@ // User-defined key type #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) -struct custom_key_type { +// Manual alignment required due to WAR libcu++ bug where cuda::atomic fails for underaligned types +struct alignas(8) custom_key_type { int32_t a; int32_t b; From 5ac3ff2091fe380f60ff4caaa4c86d20ccc47b8f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Sep 2021 18:42:02 -0400 Subject: [PATCH 08/70] Update test cmake files: license + cleanups --- tests/CMakeLists.txt | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 40bd2b30a..f14b3d221 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,3 +1,18 @@ +#============================================================================= +# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= cmake_minimum_required(VERSION 3.18 FATAL_ERROR) include(CTest) @@ -23,7 +38,7 @@ target_link_libraries(CatchMain Catch2::Catch2) ################################################################################################### function(ConfigureTest TEST_NAME TEST_SRC) add_executable(${TEST_NAME} - "${TEST_SRC}" + ${TEST_SRC} $) # Link in the CatchMain object file target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco CUDA::cudart) set_target_properties(${TEST_NAME} PROPERTIES @@ -35,13 +50,13 @@ endfunction(ConfigureTest) ################################################################################################### ### test sources ################################################################################## ################################################################################################### -set(STATIC_MAP_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_test.cu") -ConfigureTest(STATIC_MAP_TEST "${STATIC_MAP_TEST_SRC}") -#################################################################################################### -set(DYNAMIC_MAP_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/dynamic_map/dynamic_map_test.cu") +################################################################################################### +# - static_map tests ------------------------------------------------------------------------------ +ConfigureTest(STATIC_MAP_TEST + static_map/static_map_test.cu) -ConfigureTest(DYNAMIC_MAP_TEST "${DYNAMIC_MAP_TEST_SRC}") -#################################################################################################### +################################################################################################### +# - static_map tests ------------------------------------------------------------------------------ 
+ConfigureTest(DYNAMIC_MAP_TEST + dynamic_map/dynamic_map_test.cu) From 762c74a9d7903f547831f38a994bdaf9ece21bd9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Sep 2021 19:09:42 -0400 Subject: [PATCH 09/70] Add util header --- tests/util.hpp | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 tests/util.hpp diff --git a/tests/util.hpp b/tests/util.hpp new file mode 100644 index 000000000..44d4b8bc3 --- /dev/null +++ b/tests/util.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cg = cooperative_groups; + +// User-defined logical algorithms to reduce compilation time +template +bool all_of(Iterator begin, Iterator end, Predicate p) +{ + auto size = thrust::distance(begin, end); + return size == thrust::count_if(begin, end, p); +} + +template +bool any_of(Iterator begin, Iterator end, Predicate p) +{ + return thrust::count_if(begin, end, p) > 0; +} + +template +bool none_of(Iterator begin, Iterator end, Predicate p) +{ + return not all_of(begin, end, p); +} From 1b5440bf7064db1d39b860760591708fba5fc251 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 28 Sep 2021 19:10:40 -0400 Subject: [PATCH 10/70] Include util header in tests --- tests/CMakeLists.txt | 1 + tests/dynamic_map/dynamic_map_test.cu | 36 ++++++--------------------- tests/static_map/static_map_test.cu | 33 ++++-------------------- 3 files changed, 13 insertions(+), 57 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f14b3d221..8c0167c4f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -41,6 +41,7 @@ function(ConfigureTest TEST_NAME TEST_SRC) ${TEST_SRC} $) # Link in the CatchMain object file target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco CUDA::cudart) + target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) diff --git a/tests/dynamic_map/dynamic_map_test.cu b/tests/dynamic_map/dynamic_map_test.cu index 3e4b94f02..1c377ca7e 100644 --- a/tests/dynamic_map/dynamic_map_test.cu +++ b/tests/dynamic_map/dynamic_map_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,15 @@ * limitations under the License. */ -#include +#include + #include #include -#include + #include #include -#include + +#include enum class dist_type { UNIQUE, @@ -55,30 +57,6 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) { } } -namespace { -// Thrust logical algorithms (any_of/all_of/none_of) don't work with device -// lambdas: See https://github.com/thrust/thrust/issues/1062 -template -bool all_of(Iterator begin, Iterator end, Predicate p) -{ - auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); -} - -template -bool any_of(Iterator begin, Iterator end, Predicate p) -{ - return thrust::count_if(begin, end, p) > 0; -} - -template -bool none_of(Iterator begin, Iterator end, Predicate p) -{ - return not all_of(begin, end, p); -} -} // namespace - - TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", ((typename T, dist_type Dist), T, Dist), (int32_t, dist_type::UNIQUE), (int64_t, dist_type::UNIQUE), @@ -145,4 +123,4 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", REQUIRE(none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); } -} \ No newline at end of file +} diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 779305996..400aed446 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,39 +14,16 @@ * limitations under the License. 
*/ -#include +#include + #include #include #include -#include + #include #include -#include -namespace { -namespace cg = cooperative_groups; - -// Thrust logical algorithms (any_of/all_of/none_of) don't work with device -// lambdas: See https://github.com/thrust/thrust/issues/1062 -template -bool all_of(Iterator begin, Iterator end, Predicate p) -{ - auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); -} - -template -bool any_of(Iterator begin, Iterator end, Predicate p) -{ - return thrust::count_if(begin, end, p) > 0; -} - -template -bool none_of(Iterator begin, Iterator end, Predicate p) -{ - return not all_of(begin, end, p); -} -} // namespace +#include enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; From d44f90d6c5684d1f7cf188b5e4c5f7ceae975a70 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 29 Sep 2021 12:18:48 -0400 Subject: [PATCH 11/70] Split tests into multiple files --- tests/CMakeLists.txt | 11 +- tests/dynamic_map/dynamic_map_test.cu | 126 ----- tests/dynamic_map/unique_sequence_test.cu | 89 ++++ tests/static_map/custom_type_test.cu | 263 ++++++++++ tests/static_map/shared_memory_test.cu | 234 +++++++++ tests/static_map/static_map_test.cu | 575 ---------------------- tests/static_map/unique_sequence_test.cu | 146 ++++++ 7 files changed, 738 insertions(+), 706 deletions(-) delete mode 100644 tests/dynamic_map/dynamic_map_test.cu create mode 100644 tests/dynamic_map/unique_sequence_test.cu create mode 100644 tests/static_map/custom_type_test.cu create mode 100644 tests/static_map/shared_memory_test.cu delete mode 100644 tests/static_map/static_map_test.cu create mode 100644 tests/static_map/unique_sequence_test.cu diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8c0167c4f..2818959ae 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -36,9 +36,8 @@ add_library(CatchMain OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/catch_main.cpp) target_link_libraries(CatchMain Catch2::Catch2) 
################################################################################################### -function(ConfigureTest TEST_NAME TEST_SRC) - add_executable(${TEST_NAME} - ${TEST_SRC} +function(ConfigureTest TEST_NAME) + add_executable(${TEST_NAME} ${ARGN} $) # Link in the CatchMain object file target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco CUDA::cudart) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) @@ -55,9 +54,11 @@ endfunction(ConfigureTest) ################################################################################################### # - static_map tests ------------------------------------------------------------------------------ ConfigureTest(STATIC_MAP_TEST - static_map/static_map_test.cu) + static_map/custom_type_test.cu + static_map/unique_sequence_test.cu + static_map/shared_memory_test.cu) ################################################################################################### # - static_map tests ------------------------------------------------------------------------------ ConfigureTest(DYNAMIC_MAP_TEST - dynamic_map/dynamic_map_test.cu) + dynamic_map/unique_sequence_test.cu) diff --git a/tests/dynamic_map/dynamic_map_test.cu b/tests/dynamic_map/dynamic_map_test.cu deleted file mode 100644 index 1c377ca7e..000000000 --- a/tests/dynamic_map/dynamic_map_test.cu +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include - -#include -#include - -#include - -enum class dist_type { - UNIQUE, - UNIFORM, - GAUSSIAN -}; - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) { - auto num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch(Dist) { - case dist_type::UNIQUE: - for(auto i = 0; i < num_keys; ++i) { - output_begin[i] = i; - } - break; - case dist_type::UNIFORM: - for(auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(gen())); - } - break; - case dist_type::GAUSSIAN: - std::normal_distribution<> dg{1e9, 1e7}; - for(auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(dg(gen))); - } - break; - } -} - -TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", - ((typename T, dist_type Dist), T, Dist), - (int32_t, dist_type::UNIQUE), (int64_t, dist_type::UNIQUE), - (int32_t, dist_type::UNIFORM), (int64_t, dist_type::UNIFORM), - (int32_t, dist_type::GAUSSIAN), (int64_t, dist_type::GAUSSIAN)) -{ - using Key = T; - using Value = T; - - constexpr std::size_t num_keys{50'000'000}; - cuco::dynamic_map map{30'000'000, -1, -1}; - - std::vector h_keys( num_keys ); - std::vector h_values( num_keys ); - std::vector> h_pairs ( num_keys ); - - generate_keys(h_keys.begin(), h_keys.end()); - - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_values[i] = val; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys( h_keys ); - thrust::device_vector d_values( h_values ); - thrust::device_vector> d_pairs( h_pairs ); - thrust::device_vector d_results( num_keys ); - thrust::device_vector d_contained( num_keys ); - - // bulk function test cases - SECTION("All inserted keys-value pairs should be correctly recovered during find") - { - 
map.insert(d_pairs.begin(), d_pairs.end()); - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); - - REQUIRE(all_of(zip, zip + num_keys, - [] __device__(auto const& p) { - return thrust::get<0>(p) == thrust::get<1>(p); - })); - } - - SECTION("All non-inserted keys-value pairs should have the empty sentinel value recovered") - { - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - - REQUIRE(all_of(d_results.begin(), d_results.end(), [] __device__(auto const& p) { return p == -1; })); - } - - SECTION("All inserted keys-value pairs should be contained") - { - map.insert(d_pairs.begin(), d_pairs.end()); - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - - REQUIRE(all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); - } - - SECTION("Non-inserted keys-value pairs should not be contained") - { - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - - REQUIRE(none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); - } -} diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu new file mode 100644 index 000000000..d166ab0a7 --- /dev/null +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int32_t), + (int64_t, int64_t)) +{ + constexpr std::size_t num_keys{50'000'000}; + cuco::dynamic_map map{30'000'000, -1, -1}; + + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + thrust::device_vector> d_pairs(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + thrust::sequence(thrust::device, d_values.begin(), d_values.end()); + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_keys), + d_pairs.begin(), + [] __device__(auto i) { + return cuco::pair_type{i, i}; + }); + + thrust::device_vector d_results(num_keys); + thrust::device_vector d_contained(num_keys); + + // bulk function test cases + SECTION("All inserted keys-value pairs should be correctly recovered during find") + { + map.insert(d_pairs.begin(), d_pairs.end()); + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); + + REQUIRE(all_of(zip, zip + num_keys, [] __device__(auto const& p) { + return thrust::get<0>(p) == thrust::get<1>(p); + })); + } + + SECTION("All non-inserted keys-value pairs should have the empty sentinel value recovered") + { + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + + REQUIRE( + all_of(d_results.begin(), d_results.end(), [] __device__(auto const& p) { return p == -1; })); + } + + SECTION("All inserted keys-value pairs should be contained") + { + map.insert(d_pairs.begin(), d_pairs.end()); + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + + REQUIRE( + all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + } + + SECTION("Non-inserted keys-value pairs should not be contained") + 
{ + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + + REQUIRE( + none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + } +} diff --git a/tests/static_map/custom_type_test.cu b/tests/static_map/custom_type_test.cu new file mode 100644 index 000000000..8334809b9 --- /dev/null +++ b/tests/static_map/custom_type_test.cu @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; + +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ + auto num_keys = std::distance(output_begin, output_end); + + std::random_device rd; + std::mt19937 gen{rd()}; + + switch (Dist) { + case dist_type::UNIQUE: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = i; + } + break; + case dist_type::UNIFORM: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(gen())); + } + break; + case dist_type::GAUSSIAN: + std::normal_distribution<> dg{1e9, 1e7}; + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(dg(gen))); + } + break; + } +} + +// User-defined key type +// Need to specify alignment to WAR libcu++ bug where cuda::atomic fails for underaligned types: +// https://github.com/NVIDIA/libcudacxx/issues/160 +struct alignas(8) key_pair { + int32_t a; + int32_t b; +}; + +struct hash_key_pair { + __device__ uint32_t operator()(key_pair k) { return k.a; }; +}; + +struct key_pair_equals { + __device__ bool operator()(key_pair lhs, key_pair rhs) + { + return std::tie(lhs.a, lhs.b) == std::tie(rhs.a, rhs.b); + } +}; + +struct alignas(8) value_pair { + int32_t f; + int32_t s; +}; + +#define SIZE 10 +__device__ int A[SIZE]; + +template +struct custom_equals { + __device__ bool operator()(T lhs, T rhs) { return A[lhs] == A[rhs]; } +}; + +TEST_CASE("User defined key and value type", "") +{ + using Key = key_pair; + using Value = value_pair; + + auto constexpr sentinel_key = Key{-1, -1}; + auto constexpr sentinel_value = Value{-1, -1}; + + constexpr std::size_t num_pairs = 100; + constexpr std::size_t capacity = num_pairs * 2; + cuco::static_map map{capacity, sentinel_key, sentinel_value}; + + thrust::device_vector insert_keys(num_pairs); + thrust::device_vector insert_values(num_pairs); + + thrust::transform(thrust::device, + thrust::counting_iterator(0), + 
thrust::counting_iterator(num_pairs), + insert_keys.begin(), + [] __device__(auto i) { + return Key{i, i}; + }); + + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_pairs), + insert_values.begin(), + [] __device__(auto i) { + return Value{i, i}; + }); + + auto insert_pairs = + thrust::make_zip_iterator(thrust::make_tuple(insert_keys.begin(), insert_values.begin())); + + SECTION("All inserted keys-value pairs should be correctly recovered during find") + { + thrust::device_vector found_values(num_pairs); + map.insert(insert_pairs, insert_pairs + num_pairs, hash_key_pair{}, key_pair_equals{}); + + REQUIRE(num_pairs == map.get_size()); + + map.find(insert_keys.begin(), + insert_keys.end(), + found_values.begin(), + hash_key_pair{}, + key_pair_equals{}); + + REQUIRE(thrust::equal(thrust::device, + insert_values.begin(), + insert_values.end(), + found_values.begin(), + [] __device__(value_pair lhs, value_pair rhs) { + return std::tie(lhs.f, lhs.s) == std::tie(rhs.f, rhs.s); + })); + } + + SECTION("All inserted keys-value pairs should be contained") + { + thrust::device_vector contained(num_pairs); + map.insert(insert_pairs, insert_pairs + num_pairs, hash_key_pair{}, key_pair_equals{}); + map.contains(insert_keys.begin(), + insert_keys.end(), + contained.begin(), + hash_key_pair{}, + key_pair_equals{}); + REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); + } + + SECTION("Non-inserted keys-value pairs should not be contained") + { + thrust::device_vector contained(num_pairs); + map.contains(insert_keys.begin(), + insert_keys.end(), + contained.begin(), + hash_key_pair{}, + key_pair_equals{}); + REQUIRE( + none_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); + } + + SECTION("Inserting unique keys should return insert success.") + { + auto m_view = map.get_device_mutable_view(); + REQUIRE(all_of(insert_pairs, + insert_pairs + num_pairs, + 
[m_view] __device__(cuco::pair_type const& pair) mutable { + return m_view.insert(pair, hash_key_pair{}, key_pair_equals{}); + })); + } + + SECTION("Cannot find any key in an empty hash map") + { + SECTION("non-const view") + { + auto view = map.get_device_view(); + REQUIRE(all_of(insert_pairs, + insert_pairs + num_pairs, + [view] __device__(cuco::pair_type const& pair) mutable { + return view.find(pair.first, hash_key_pair{}, key_pair_equals{}) == + view.end(); + })); + } + + SECTION("const view") + { + auto const view = map.get_device_view(); + REQUIRE(all_of(insert_pairs, + insert_pairs + num_pairs, + [view] __device__(cuco::pair_type const& pair) { + return view.find(pair.first, hash_key_pair{}, key_pair_equals{}) == + view.end(); + })); + } + } +} + +TEMPLATE_TEST_CASE_SIG("Key comparison against sentinel", + "", + ((typename T, dist_type Dist), T, Dist), + (int32_t, dist_type::UNIQUE), + (int64_t, dist_type::UNIQUE)) +{ + using Key = T; + using Value = T; + + constexpr std::size_t num_keys{SIZE}; + cuco::static_map map{SIZE * 2, -1, -1}; + + auto m_view = map.get_device_mutable_view(); + auto view = map.get_device_view(); + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + int h_A[SIZE]; + for (int i = 0; i < SIZE; i++) { + h_A[i] = i; + } + cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int)); + + thrust::device_vector> d_pairs(h_pairs); + + SECTION( + "Tests of non-CG insert: The custom `key_equal` can never be used to compare against sentinel") + { + REQUIRE(all_of(d_pairs.begin(), + d_pairs.end(), + [m_view] __device__(cuco::pair_type const& pair) mutable { + return m_view.insert( + pair, cuco::detail::MurmurHash3_32{}, custom_equals{}); + })); + } + + SECTION( + "Tests of CG insert: The custom `key_equal` can never be used to compare against 
sentinel") + { + map.insert( + d_pairs.begin(), d_pairs.end(), cuco::detail::MurmurHash3_32{}, custom_equals{}); + // All keys inserted via custom `key_equal` should be found + REQUIRE(all_of( + d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and + (found->first.load() == pair.first and found->second.load() == pair.second); + })); + } +} diff --git a/tests/static_map/shared_memory_test.cu b/tests/static_map/shared_memory_test.cu new file mode 100644 index 000000000..52a86f862 --- /dev/null +++ b/tests/static_map/shared_memory_test.cu @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include +#include + +#include + +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; + +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ + auto num_keys = std::distance(output_begin, output_end); + + std::random_device rd; + std::mt19937 gen{rd()}; + + switch (Dist) { + case dist_type::UNIQUE: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = i; + } + break; + case dist_type::UNIFORM: + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(gen())); + } + break; + case dist_type::GAUSSIAN: + std::normal_distribution<> dg{1e9, 1e7}; + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = std::abs(static_cast(dg(gen))); + } + break; + } +} + +template +__global__ void shared_memory_test_kernel( + typename MapType::device_view const* const device_views, + typename MapType::device_view::key_type const* const insterted_keys, + typename MapType::device_view::mapped_type const* const inserted_values, + const size_t number_of_elements, + bool* const keys_exist, + bool* const keys_and_values_correct) +{ + // Each block processes one map + const size_t map_id = blockIdx.x; + const size_t offset = map_id * number_of_elements; + + __shared__ typename MapType::pair_atomic_type sm_buffer[CAPACITY]; + + auto g = cg::this_thread_block(); + typename MapType::device_view sm_device_view = + MapType::device_view::make_copy(g, sm_buffer, device_views[map_id]); + + for (int i = g.thread_rank(); i < number_of_elements; i += g.size()) { + auto found_pair_it = sm_device_view.find(insterted_keys[offset + i]); + + if (found_pair_it != sm_device_view.end()) { + keys_exist[offset + i] = true; + if (found_pair_it->first == insterted_keys[offset + i] and + found_pair_it->second == inserted_values[offset + i]) { + keys_and_values_correct[offset + i] = true; + } else { + keys_and_values_correct[offset + i] = false; + } + } else { + keys_exist[offset + i] = false; + 
keys_and_values_correct[offset + i] = true; + } + } +} + +TEMPLATE_TEST_CASE_SIG("Shared memory static map", + "", + ((typename T, dist_type Dist), T, Dist), + (int32_t, dist_type::UNIQUE), + (int64_t, dist_type::UNIQUE), + (int32_t, dist_type::UNIFORM), + (int64_t, dist_type::UNIFORM), + (int32_t, dist_type::GAUSSIAN), + (int64_t, dist_type::GAUSSIAN)) +{ + using KeyType = T; + using ValueType = T; + using MapType = cuco::static_map; + using DeviceViewType = typename MapType::device_view; + using DeviceViewIteratorType = typename DeviceViewType::iterator; + + constexpr std::size_t number_of_maps = 1000; + constexpr std::size_t elements_in_map = 500; + constexpr std::size_t map_capacity = 2 * elements_in_map; + + // one array for all maps, first elements_in_map element belong to map 0, second to map 1 and so + // on + std::vector h_keys(number_of_maps * elements_in_map); + std::vector h_values(number_of_maps * elements_in_map); + std::vector> h_pairs(number_of_maps * elements_in_map); + + // using std::unique_ptr because static_map does not have copy/move constructor/assignment + // operator yet + std::vector> maps; + + for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { + const std::size_t offset = map_id * elements_in_map; + + generate_keys(h_keys.begin() + offset, + h_keys.begin() + offset + elements_in_map); + + for (std::size_t i = 0; i < elements_in_map; ++i) { + KeyType key = h_keys[offset + i]; + ValueType val = key < std::numeric_limits::max() ? 
key + 1 : 0; + h_values[offset + i] = val; + h_pairs[offset + i].first = key; + h_pairs[offset + i].second = val; + } + + maps.push_back(std::make_unique(map_capacity, -1, -1)); + } + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + thrust::device_vector> d_pairs(h_pairs); + + SECTION("Keys are all found after insertion.") + { + std::vector h_device_views; + for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { + const std::size_t offset = map_id * elements_in_map; + + MapType* map = maps[map_id].get(); + map->insert(d_pairs.begin() + offset, d_pairs.begin() + offset + elements_in_map); + h_device_views.push_back(map->get_device_view()); + } + thrust::device_vector d_device_views(h_device_views); + + thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); + thrust::device_vector d_keys_and_values_correct(number_of_maps * elements_in_map); + + shared_memory_test_kernel + <<>>(d_device_views.data().get(), + d_keys.data().get(), + d_values.data().get(), + elements_in_map, + d_keys_exist.data().get(), + d_keys_and_values_correct.data().get()); + + REQUIRE(d_keys_exist.size() == d_keys_and_values_correct.size()); + auto zip = thrust::make_zip_iterator( + thrust::make_tuple(d_keys_exist.begin(), d_keys_and_values_correct.begin())); + + REQUIRE(all_of(zip, zip + d_keys_exist.size(), [] __device__(auto const& z) { + return thrust::get<0>(z) and thrust::get<1>(z); + })); + } + + SECTION("No key is found before insertion.") + { + std::vector h_device_views; + for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { + h_device_views.push_back(maps[map_id].get()->get_device_view()); + } + thrust::device_vector d_device_views(h_device_views); + + thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); + thrust::device_vector d_keys_and_values_correct(number_of_maps * elements_in_map); + + shared_memory_test_kernel + <<>>(d_device_views.data().get(), + d_keys.data().get(), + 
d_values.data().get(), + elements_in_map, + d_keys_exist.data().get(), + d_keys_and_values_correct.data().get()); + + REQUIRE(none_of(d_keys_exist.begin(), d_keys_exist.end(), [] __device__(const bool key_found) { + return key_found; + })); + } +} + +template +__global__ void shared_memory_hash_table_kernel(bool* key_found) +{ + namespace cg = cooperative_groups; + using map_type = typename cuco::static_map::device_mutable_view; + using find_map_type = typename cuco::static_map::device_view; + __shared__ typename map_type::slot_type slots[N]; + auto map = map_type::make_from_uninitialized_slots(cg::this_thread_block(), &slots[0], N, -1, -1); + + auto g = cg::this_thread_block(); + std::size_t index = threadIdx.x + blockIdx.x * blockDim.x; + int rank = g.thread_rank(); + + // insert {thread_rank, thread_rank} for each thread in thread-block + map.insert(cuco::pair(rank, rank)); + g.sync(); + + auto find_map = find_map_type(map); + auto retrieved_pair = find_map.find(rank); + if (retrieved_pair != find_map.end() && retrieved_pair->second == rank) { + key_found[index] = true; + } +} + +TEMPLATE_TEST_CASE("Shared memory slots.", "", int32_t) +{ + constexpr std::size_t N = 256; + thrust::device_vector key_found(N, false); + shared_memory_hash_table_kernel<<<8, 32>>>(key_found.data().get()); + + REQUIRE(all_of(key_found.begin(), key_found.end(), thrust::identity{})); +} diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu deleted file mode 100644 index 400aed446..000000000 --- a/tests/static_map/static_map_test.cu +++ /dev/null @@ -1,575 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include - -#include -#include - -#include - -enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::UNIQUE: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = i; - } - break; - case dist_type::UNIFORM: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(gen())); - } - break; - case dist_type::GAUSSIAN: - std::normal_distribution<> dg{1e9, 1e7}; - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(dg(gen))); - } - break; - } -} - -// User-defined key type -// Need to specify alignment to WAR libcu++ bug where cuda::atomic fails for underaligned types: -// https://github.com/NVIDIA/libcudacxx/issues/160 -struct alignas(8) key_pair { - int32_t a; - int32_t b; -}; - -struct hash_key_pair { - __device__ uint32_t operator()(key_pair k) { return k.a; }; -}; - -struct key_pair_equals { - __device__ bool operator()(key_pair lhs, key_pair rhs) - { - return std::tie(lhs.a, lhs.b) == std::tie(rhs.a, rhs.b); - } -}; - -struct alignas(8) value_pair { - int32_t f; - int32_t s; -}; - -#define SIZE 10 -__device__ int A[SIZE]; - -template -struct custom_equals { - __device__ bool operator()(T lhs, T rhs) { return A[lhs] == A[rhs]; } -}; - -TEST_CASE("User defined key and value type", "") -{ - using Key = key_pair; - using Value = 
value_pair; - - auto constexpr sentinel_key = Key{-1, -1}; - auto constexpr sentinel_value = Value{-1, -1}; - - constexpr std::size_t num_pairs = 100; - constexpr std::size_t capacity = num_pairs * 2; - cuco::static_map map{capacity, sentinel_key, sentinel_value}; - - thrust::device_vector insert_keys(num_pairs); - thrust::device_vector insert_values(num_pairs); - - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_pairs), - insert_keys.begin(), - [] __device__(auto i) { - return Key{i, i}; - }); - - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_pairs), - insert_values.begin(), - [] __device__(auto i) { - return Value{i, i}; - }); - - auto insert_pairs = - thrust::make_zip_iterator(thrust::make_tuple(insert_keys.begin(), insert_values.begin())); - - SECTION("All inserted keys-value pairs should be correctly recovered during find") - { - thrust::device_vector found_values(num_pairs); - map.insert(insert_pairs, insert_pairs + num_pairs, hash_key_pair{}, key_pair_equals{}); - - REQUIRE(num_pairs == map.get_size()); - - map.find(insert_keys.begin(), - insert_keys.end(), - found_values.begin(), - hash_key_pair{}, - key_pair_equals{}); - - REQUIRE(thrust::equal(thrust::device, - insert_values.begin(), - insert_values.end(), - found_values.begin(), - [] __device__(value_pair lhs, value_pair rhs) { - return std::tie(lhs.f, lhs.s) == std::tie(rhs.f, rhs.s); - })); - } - - SECTION("All inserted keys-value pairs should be contained") - { - thrust::device_vector contained(num_pairs); - map.insert(insert_pairs, insert_pairs + num_pairs, hash_key_pair{}, key_pair_equals{}); - map.contains(insert_keys.begin(), - insert_keys.end(), - contained.begin(), - hash_key_pair{}, - key_pair_equals{}); - REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); - } - - SECTION("Non-inserted keys-value pairs should not be contained") - { - 
thrust::device_vector contained(num_pairs); - map.contains(insert_keys.begin(), - insert_keys.end(), - contained.begin(), - hash_key_pair{}, - key_pair_equals{}); - REQUIRE( - none_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); - } - - SECTION("Inserting unique keys should return insert success.") - { - auto m_view = map.get_device_mutable_view(); - REQUIRE(all_of(insert_pairs, - insert_pairs + num_pairs, - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair, hash_key_pair{}, key_pair_equals{}); - })); - } - - SECTION("Cannot find any key in an empty hash map") - { - SECTION("non-const view") - { - auto view = map.get_device_view(); - REQUIRE(all_of(insert_pairs, - insert_pairs + num_pairs, - [view] __device__(cuco::pair_type const& pair) mutable { - return view.find(pair.first, hash_key_pair{}, key_pair_equals{}) == - view.end(); - })); - } - - SECTION("const view") - { - auto const view = map.get_device_view(); - REQUIRE(all_of(insert_pairs, - insert_pairs + num_pairs, - [view] __device__(cuco::pair_type const& pair) { - return view.find(pair.first, hash_key_pair{}, key_pair_equals{}) == - view.end(); - })); - } - } -} - -TEMPLATE_TEST_CASE_SIG("Key comparison against sentinel", - "", - ((typename T, dist_type Dist), T, Dist), - (int32_t, dist_type::UNIQUE), - (int64_t, dist_type::UNIQUE)) -{ - using Key = T; - using Value = T; - - constexpr std::size_t num_keys{SIZE}; - cuco::static_map map{SIZE * 2, -1, -1}; - - auto m_view = map.get_device_mutable_view(); - auto view = map.get_device_view(); - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - int h_A[SIZE]; - for (int i = 0; i < SIZE; i++) { - h_A[i] = i; - } - cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int)); - - 
thrust::device_vector> d_pairs(h_pairs); - - SECTION( - "Tests of non-CG insert: The custom `key_equal` can never be used to compare against sentinel") - { - REQUIRE(all_of(d_pairs.begin(), - d_pairs.end(), - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert( - pair, cuco::detail::MurmurHash3_32{}, custom_equals{}); - })); - } - - SECTION( - "Tests of CG insert: The custom `key_equal` can never be used to compare against sentinel") - { - map.insert( - d_pairs.begin(), d_pairs.end(), cuco::detail::MurmurHash3_32{}, custom_equals{}); - // All keys inserted via custom `key_equal` should be found - REQUIRE(all_of( - d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { - auto const found = view.find(pair.first); - return (found != view.end()) and - (found->first.load() == pair.first and found->second.load() == pair.second); - })); - } -} - -TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", - "", - ((typename T, dist_type Dist), T, Dist), - (int32_t, dist_type::UNIQUE), - (int64_t, dist_type::UNIQUE), - (int32_t, dist_type::UNIFORM), - (int64_t, dist_type::UNIFORM), - (int32_t, dist_type::GAUSSIAN), - (int64_t, dist_type::GAUSSIAN)) -{ - using Key = T; - using Value = T; - - constexpr std::size_t num_keys{500'000}; - cuco::static_map map{1'000'000, -1, -1}; - - auto m_view = map.get_device_mutable_view(); - auto view = map.get_device_view(); - - std::vector h_keys(num_keys); - std::vector h_values(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - h_values[i] = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_values(h_values); - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector d_results(num_keys); - thrust::device_vector d_contained(num_keys); - - // bulk function test cases - 
SECTION("All inserted keys-value pairs should be correctly recovered during find") - { - map.insert(d_pairs.begin(), d_pairs.end()); - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); - - REQUIRE(all_of(zip, zip + num_keys, [] __device__(auto const& p) { - return thrust::get<0>(p) == thrust::get<1>(p); - })); - } - - SECTION("All inserted keys-value pairs should be contained") - { - map.insert(d_pairs.begin(), d_pairs.end()); - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - - REQUIRE( - all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); - } - - SECTION("Non-inserted keys-value pairs should not be contained") - { - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - - REQUIRE( - none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); - } - - SECTION("Inserting unique keys should return insert success.") - { - if (Dist == dist_type::UNIQUE) { - REQUIRE(all_of(d_pairs.begin(), - d_pairs.end(), - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair); - })); - } - } - - SECTION("Cannot find any key in an empty hash map with non-const view") - { - SECTION("non-const view") - { - REQUIRE(all_of(d_pairs.begin(), - d_pairs.end(), - [view] __device__(cuco::pair_type const& pair) mutable { - return view.find(pair.first) == view.end(); - })); - } - SECTION("const view") - { - REQUIRE(all_of( - d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { - return view.find(pair.first) == view.end(); - })); - } - } - - SECTION("Keys are all found after inserting many keys.") - { - // Bulk insert keys - thrust::for_each(thrust::device, - d_pairs.begin(), - d_pairs.end(), - [m_view] __device__(cuco::pair_type const& pair) mutable { - m_view.insert(pair); - }); - - SECTION("non-const view") - { - // All keys 
should be found - REQUIRE(all_of(d_pairs.begin(), - d_pairs.end(), - [view] __device__(cuco::pair_type const& pair) mutable { - auto const found = view.find(pair.first); - return (found != view.end()) and (found->first.load() == pair.first and - found->second.load() == pair.second); - })); - } - SECTION("const view") - { - // All keys should be found - REQUIRE(all_of( - d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { - auto const found = view.find(pair.first); - return (found != view.end()) and - (found->first.load() == pair.first and found->second.load() == pair.second); - })); - } - } -} - -template -__global__ void shared_memory_test_kernel( - typename MapType::device_view const* const device_views, - typename MapType::device_view::key_type const* const insterted_keys, - typename MapType::device_view::mapped_type const* const inserted_values, - const size_t number_of_elements, - bool* const keys_exist, - bool* const keys_and_values_correct) -{ - // Each block processes one map - const size_t map_id = blockIdx.x; - const size_t offset = map_id * number_of_elements; - - __shared__ typename MapType::pair_atomic_type sm_buffer[CAPACITY]; - - auto g = cg::this_thread_block(); - typename MapType::device_view sm_device_view = - MapType::device_view::make_copy(g, sm_buffer, device_views[map_id]); - - for (int i = g.thread_rank(); i < number_of_elements; i += g.size()) { - auto found_pair_it = sm_device_view.find(insterted_keys[offset + i]); - - if (found_pair_it != sm_device_view.end()) { - keys_exist[offset + i] = true; - if (found_pair_it->first == insterted_keys[offset + i] and - found_pair_it->second == inserted_values[offset + i]) { - keys_and_values_correct[offset + i] = true; - } else { - keys_and_values_correct[offset + i] = false; - } - } else { - keys_exist[offset + i] = false; - keys_and_values_correct[offset + i] = true; - } - } -} - -TEMPLATE_TEST_CASE_SIG("Shared memory static map", - "", - ((typename T, dist_type Dist), T, 
Dist), - (int32_t, dist_type::UNIQUE), - (int64_t, dist_type::UNIQUE), - (int32_t, dist_type::UNIFORM), - (int64_t, dist_type::UNIFORM), - (int32_t, dist_type::GAUSSIAN), - (int64_t, dist_type::GAUSSIAN)) -{ - using KeyType = T; - using ValueType = T; - using MapType = cuco::static_map; - using DeviceViewType = typename MapType::device_view; - using DeviceViewIteratorType = typename DeviceViewType::iterator; - - constexpr std::size_t number_of_maps = 1000; - constexpr std::size_t elements_in_map = 500; - constexpr std::size_t map_capacity = 2 * elements_in_map; - - // one array for all maps, first elements_in_map element belong to map 0, second to map 1 and so - // on - std::vector h_keys(number_of_maps * elements_in_map); - std::vector h_values(number_of_maps * elements_in_map); - std::vector> h_pairs(number_of_maps * elements_in_map); - - // using std::unique_ptr because static_map does not have copy/move constructor/assignment - // operator yet - std::vector> maps; - - for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { - const std::size_t offset = map_id * elements_in_map; - - generate_keys(h_keys.begin() + offset, - h_keys.begin() + offset + elements_in_map); - - for (std::size_t i = 0; i < elements_in_map; ++i) { - KeyType key = h_keys[offset + i]; - ValueType val = key < std::numeric_limits::max() ? 
key + 1 : 0; - h_values[offset + i] = val; - h_pairs[offset + i].first = key; - h_pairs[offset + i].second = val; - } - - maps.push_back(std::make_unique(map_capacity, -1, -1)); - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_values(h_values); - thrust::device_vector> d_pairs(h_pairs); - - SECTION("Keys are all found after insertion.") - { - std::vector h_device_views; - for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { - const std::size_t offset = map_id * elements_in_map; - - MapType* map = maps[map_id].get(); - map->insert(d_pairs.begin() + offset, d_pairs.begin() + offset + elements_in_map); - h_device_views.push_back(map->get_device_view()); - } - thrust::device_vector d_device_views(h_device_views); - - thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); - thrust::device_vector d_keys_and_values_correct(number_of_maps * elements_in_map); - - shared_memory_test_kernel - <<>>(d_device_views.data().get(), - d_keys.data().get(), - d_values.data().get(), - elements_in_map, - d_keys_exist.data().get(), - d_keys_and_values_correct.data().get()); - - REQUIRE(d_keys_exist.size() == d_keys_and_values_correct.size()); - auto zip = thrust::make_zip_iterator( - thrust::make_tuple(d_keys_exist.begin(), d_keys_and_values_correct.begin())); - - REQUIRE(all_of(zip, zip + d_keys_exist.size(), [] __device__(auto const& z) { - return thrust::get<0>(z) and thrust::get<1>(z); - })); - } - - SECTION("No key is found before insertion.") - { - std::vector h_device_views; - for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { - h_device_views.push_back(maps[map_id].get()->get_device_view()); - } - thrust::device_vector d_device_views(h_device_views); - - thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); - thrust::device_vector d_keys_and_values_correct(number_of_maps * elements_in_map); - - shared_memory_test_kernel - <<>>(d_device_views.data().get(), - d_keys.data().get(), - 
d_values.data().get(), - elements_in_map, - d_keys_exist.data().get(), - d_keys_and_values_correct.data().get()); - - REQUIRE(none_of(d_keys_exist.begin(), d_keys_exist.end(), [] __device__(const bool key_found) { - return key_found; - })); - } -} - -template -__global__ void shared_memory_hash_table_kernel(bool* key_found) -{ - namespace cg = cooperative_groups; - using map_type = typename cuco::static_map::device_mutable_view; - using find_map_type = typename cuco::static_map::device_view; - __shared__ typename map_type::slot_type slots[N]; - auto map = map_type::make_from_uninitialized_slots(cg::this_thread_block(), &slots[0], N, -1, -1); - - auto g = cg::this_thread_block(); - std::size_t index = threadIdx.x + blockIdx.x * blockDim.x; - int rank = g.thread_rank(); - - // insert {thread_rank, thread_rank} for each thread in thread-block - map.insert(cuco::pair(rank, rank)); - g.sync(); - - auto find_map = find_map_type(map); - auto retrieved_pair = find_map.find(rank); - if (retrieved_pair != find_map.end() && retrieved_pair->second == rank) { - key_found[index] = true; - } -} - -TEMPLATE_TEST_CASE("Shared memory slots.", "", int32_t) -{ - constexpr std::size_t N = 256; - thrust::device_vector key_found(N, false); - shared_memory_hash_table_kernel<<<8, 32>>>(key_found.data().get()); - - REQUIRE(all_of(key_found.begin(), key_found.end(), thrust::identity{})); -} diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu new file mode 100644 index 000000000..624020d5a --- /dev/null +++ b/tests/static_map/unique_sequence_test.cu @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int32_t), + (int64_t, int64_t)) +{ + constexpr std::size_t num_keys{500'000}; + cuco::static_map map{1'000'000, -1, -1}; + + auto m_view = map.get_device_mutable_view(); + auto view = map.get_device_view(); + + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + thrust::device_vector> d_pairs(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + thrust::sequence(thrust::device, d_values.begin(), d_values.end()); + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_keys), + d_pairs.begin(), + [] __device__(auto i) { + return cuco::pair_type{i, i}; + }); + + thrust::device_vector d_results(num_keys); + thrust::device_vector d_contained(num_keys); + + // bulk function test cases + SECTION("All inserted keys-value pairs should be correctly recovered during find") + { + map.insert(d_pairs.begin(), d_pairs.end()); + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); + + REQUIRE(all_of(zip, zip + num_keys, [] __device__(auto const& p) { + return thrust::get<0>(p) == thrust::get<1>(p); + })); + } + + SECTION("All inserted keys-value pairs should be contained") + { + map.insert(d_pairs.begin(), d_pairs.end()); + 
map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + + REQUIRE( + all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + } + + SECTION("Non-inserted keys-value pairs should not be contained") + { + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + + REQUIRE( + none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + } + + SECTION("Inserting unique keys should return insert success.") + { + REQUIRE(all_of(d_pairs.begin(), + d_pairs.end(), + [m_view] __device__(cuco::pair_type const& pair) mutable { + return m_view.insert(pair); + })); + } + + SECTION("Cannot find any key in an empty hash map with non-const view") + { + SECTION("non-const view") + { + REQUIRE(all_of(d_pairs.begin(), + d_pairs.end(), + [view] __device__(cuco::pair_type const& pair) mutable { + return view.find(pair.first) == view.end(); + })); + } + SECTION("const view") + { + REQUIRE(all_of( + d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { + return view.find(pair.first) == view.end(); + })); + } + } + + SECTION("Keys are all found after inserting many keys.") + { + // Bulk insert keys + thrust::for_each(thrust::device, + d_pairs.begin(), + d_pairs.end(), + [m_view] __device__(cuco::pair_type const& pair) mutable { + m_view.insert(pair); + }); + + SECTION("non-const view") + { + // All keys should be found + REQUIRE(all_of(d_pairs.begin(), + d_pairs.end(), + [view] __device__(cuco::pair_type const& pair) mutable { + auto const found = view.find(pair.first); + return (found != view.end()) and (found->first.load() == pair.first and + found->second.load() == pair.second); + })); + } + SECTION("const view") + { + // All keys should be found + REQUIRE(all_of( + d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and + (found->first.load() == pair.first and 
found->second.load() == pair.second); + })); + } + } +} From f2dd037f9251ae998468c9b4be3e57468c1f069f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 1 Oct 2021 18:05:45 -0400 Subject: [PATCH 12/70] Correction: custom type example based on runtime capability check --- examples/static_map/custom_type.cu | 51 ++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/examples/static_map/custom_type.cu b/examples/static_map/custom_type.cu index df4e8c647..90d9dfb5f 100644 --- a/examples/static_map/custom_type.cu +++ b/examples/static_map/custom_type.cu @@ -21,38 +21,36 @@ #include // User-defined key type -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) // Manual alignment required due to WAR libcu++ bug where cuda::atomic fails for underaligned types -struct alignas(8) custom_key_type { +struct alignas(8) key_pair_type { int32_t a; int32_t b; - __host__ __device__ custom_key_type() {} - __host__ __device__ custom_key_type(int32_t x) : a{x}, b{x} {} + __host__ __device__ key_pair_type() {} + __host__ __device__ key_pair_type(int32_t x) : a{x}, b{x} {} // Device equality operator is mandatory - __device__ bool operator==(custom_key_type const& other) const + __device__ bool operator==(key_pair_type const& other) const { return a == other.a and b == other.b; } }; -#else + // Key type larger than 8B only supported for sm_70 and up -struct custom_key_type { +struct key_triplet_type { int32_t a; int32_t b; int32_t c; - __host__ __device__ custom_key_type() {} - __host__ __device__ custom_key_type(int32_t x) : a{x}, b{x}, c{x} {} + __host__ __device__ key_triplet_type() {} + __host__ __device__ key_triplet_type(int32_t x) : a{x}, b{x}, c{x} {} // Device equality operator is mandatory - __device__ bool operator==(custom_key_type const& other) const + __device__ bool operator==(key_triplet_type const& other) const { return a == other.a and b == other.b and c == other.c; } }; -#endif // User-defined value type // Manual alignment 
required due to WAR libcu++ bug where cuda::atomic fails for underaligned types @@ -66,18 +64,24 @@ struct alignas(8) custom_value_type { // User-defined device hash callable struct custom_hash { - __device__ uint32_t operator()(custom_key_type k) { return k.a; }; + template + __device__ uint32_t operator()(key_type k) + { + return k.a; + }; }; // User-defined device key equal callable struct custom_key_equals { - __device__ bool operator()(custom_key_type const& lhs, custom_key_type const& rhs) + template + __device__ bool operator()(key_type const& lhs, key_type const& rhs) { return lhs.a == rhs.a; } }; -int main(void) +template +void run_example() { constexpr std::size_t num_pairs = 80'000; @@ -118,6 +122,25 @@ int main(void) // All inserted keys are contained assert( thrust::all_of(contained.begin(), contained.end(), [] __device__(auto const& b) { return b; })); +} + +int main(void) +{ + constexpr int volta_major_number = 7; + + // Retrieve major compute capability version number + int dev_id, cap_major; + cudaGetDevice(&dev_id); + cudaDeviceGetAttribute(&cap_major, cudaDevAttrComputeCapabilityMajor, dev_id); + + // Run 8B-key example on Pascal + if (cap_major < volta_major_number) { + run_example(); + } + // 12B-key example on sm_70 and up + else { + run_example(); + } return 0; } From 890073080164cd4b20c66f206ac3735ebc41cfee Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 1 Oct 2021 18:20:43 -0400 Subject: [PATCH 13/70] Revert back to use custom logical functions + runtime capability check in unit tests --- tests/static_map/static_map_test.cu | 36 +++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index d3f69271e..5d2faa5de 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -29,8 +28,30 @@ // for a function-scope static __shared__ 
variable within a __device__/__global__ function" #pragma diag_suppress static_var_with_dynamic_init +namespace { namespace cg = cooperative_groups; +// User-defined logical algorithms to reduce compilation time +template +bool all_of(Iterator begin, Iterator end, Predicate p) +{ + auto size = thrust::distance(begin, end); + return size == thrust::count_if(begin, end, p); +} + +template +bool any_of(Iterator begin, Iterator end, Predicate p) +{ + return thrust::count_if(begin, end, p) > 0; +} + +template +bool none_of(Iterator begin, Iterator end, Predicate p) +{ + return not all_of(begin, end, p); +} +} // namespace + enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; template @@ -138,14 +159,21 @@ struct custom_equals { TEMPLATE_TEST_CASE_SIG("User defined key and value type", "", ((typename Key, typename Value), Key, Value), -// A key/value type larger than 8B is supported only for sm_70 and up -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) (key_pair_type, value_pair_type), (key_pair_type, value_pair_type), (large_key_type, value_pair_type), -#endif (key_pair_type, value_pair_type)) { + constexpr int volta_major_number = 7; + + // Retrieve major compute capability version number + int dev_id, cap_major; + cudaGetDevice(&dev_id); + cudaDeviceGetAttribute(&cap_major, cudaDevAttrComputeCapabilityMajor, dev_id); + + // Key type larger than 8B only supported for sm_70 and up + if (sizeof(Key) > 8 and cap_major < volta_major_number) { return; } + auto const sentinel_key = Key{-1}; auto const sentinel_value = Value{-1}; From 2d7826dd11ccca29d54cb6142c016482b3a29a68 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 4 Oct 2021 10:34:23 -0400 Subject: [PATCH 14/70] Use BUILD_PASCAL_CODE macro in tests and example --- CMakeLists.txt | 7 +++++ examples/static_map/custom_type.cu | 42 +++++++++-------------------- tests/static_map/static_map_test.cu | 12 ++------- 3 files changed, 21 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt 
index 6ba819218..f2db1643d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,13 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") set(default_build_option_state ON) endif() +foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + if("${arch}" MATCHES "60") + add_compile_definitions(BUILD_PASCAL_CODE) + break() + endif() +endforeach() + option(BUILD_TESTS "Configure CMake to build tests" ${default_build_option_state}) option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" ${default_build_option_state}) option(BUILD_EXAMPLES "Configure CMake to build examples" ${default_build_option_state}) diff --git a/examples/static_map/custom_type.cu b/examples/static_map/custom_type.cu index 90d9dfb5f..c4dab460f 100644 --- a/examples/static_map/custom_type.cu +++ b/examples/static_map/custom_type.cu @@ -21,36 +21,38 @@ #include // User-defined key type +#ifdef BUILD_PASCAL_CODE // Manual alignment required due to WAR libcu++ bug where cuda::atomic fails for underaligned types -struct alignas(8) key_pair_type { +struct alignas(8) custom_key_type { int32_t a; int32_t b; - __host__ __device__ key_pair_type() {} - __host__ __device__ key_pair_type(int32_t x) : a{x}, b{x} {} + __host__ __device__ custom_key_type() {} + __host__ __device__ custom_key_type(int32_t x) : a{x}, b{x} {} // Device equality operator is mandatory - __device__ bool operator==(key_pair_type const& other) const + __device__ bool operator==(custom_key_type const& other) const { return a == other.a and b == other.b; } }; - +#else // Key type larger than 8B only supported for sm_70 and up -struct key_triplet_type { +struct custom_key_type { int32_t a; int32_t b; int32_t c; - __host__ __device__ key_triplet_type() {} - __host__ __device__ key_triplet_type(int32_t x) : a{x}, b{x}, c{x} {} + __host__ __device__ custom_key_type() {} + __host__ __device__ custom_key_type(int32_t x) : a{x}, b{x}, c{x} {} // Device equality operator is mandatory - __device__ bool 
operator==(key_triplet_type const& other) const + __device__ bool operator==(custom_key_type const& other) const { return a == other.a and b == other.b and c == other.c; } }; +#endif // User-defined value type // Manual alignment required due to WAR libcu++ bug where cuda::atomic fails for underaligned types @@ -80,8 +82,7 @@ struct custom_key_equals { } }; -template -void run_example() +int main(void) { constexpr std::size_t num_pairs = 80'000; @@ -122,25 +123,6 @@ void run_example() // All inserted keys are contained assert( thrust::all_of(contained.begin(), contained.end(), [] __device__(auto const& b) { return b; })); -} - -int main(void) -{ - constexpr int volta_major_number = 7; - - // Retrieve major compute capability version number - int dev_id, cap_major; - cudaGetDevice(&dev_id); - cudaDeviceGetAttribute(&cap_major, cudaDevAttrComputeCapabilityMajor, dev_id); - - // Run 8B-key example on Pascal - if (cap_major < volta_major_number) { - run_example(); - } - // 12B-key example on sm_70 and up - else { - run_example(); - } return 0; } diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 5d2faa5de..7acf5f2e2 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -159,21 +159,13 @@ struct custom_equals { TEMPLATE_TEST_CASE_SIG("User defined key and value type", "", ((typename Key, typename Value), Key, Value), +#ifndef BUILD_PASCAL_CODE // Key type larger than 8B only supported for sm_70 and up (key_pair_type, value_pair_type), (key_pair_type, value_pair_type), (large_key_type, value_pair_type), +#endif (key_pair_type, value_pair_type)) { - constexpr int volta_major_number = 7; - - // Retrieve major compute capability version number - int dev_id, cap_major; - cudaGetDevice(&dev_id); - cudaDeviceGetAttribute(&cap_major, cudaDevAttrComputeCapabilityMajor, dev_id); - - // Key type larger than 8B only supported for sm_70 and up - if (sizeof(Key) > 8 and cap_major < volta_major_number) { 
return; } - auto const sentinel_key = Key{-1}; auto const sentinel_value = Value{-1}; From bc2558c177123875c35e6cd2eb9930bfc3a4d722 Mon Sep 17 00:00:00 2001 From: Chirayu Date: Mon, 1 Nov 2021 11:39:01 -0700 Subject: [PATCH 15/70] Add plumbing for execution stream for static_map functions --- include/cuco/detail/static_map.inl | 50 ++++++++++++++++++++---------- include/cuco/static_map.cuh | 13 +++++--- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index da5d88db7..0826ff30d 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -22,26 +22,33 @@ template ::static_map(std::size_t capacity, Key empty_key_sentinel, Value empty_value_sentinel, - Allocator const& alloc) + Allocator const& alloc, + cudaStream_t stream) : capacity_{std::max(capacity, std::size_t{1})}, // to avoid dereferencing a nullptr (Issue #72) empty_key_sentinel_{empty_key_sentinel}, empty_value_sentinel_{empty_value_sentinel}, slot_allocator_{alloc}, - counter_allocator_{alloc} + counter_allocator_{alloc}, + exec_stream_{stream} { - slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_); - num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); + // allocator should allocate memory accessible by the exec_stream_ + slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_, + exec_stream_); + num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1, + exec_stream_); auto constexpr block_size = 256; auto constexpr stride = 4; auto const grid_size = (capacity_ + stride * block_size - 1) / (stride * block_size); detail::initialize - <<>>(slots_, empty_key_sentinel, empty_value_sentinel, capacity_); + <<>>(slots_, empty_key_sentinel, empty_value_sentinel, + capacity_); } template static_map::~static_map() { + // use exec_stream_ parameter param std::allocator_traits::deallocate(slot_allocator_, slots_, capacity_); 
std::allocator_traits::deallocate(counter_allocator_, num_successes_, 1); } @@ -51,7 +58,8 @@ template void static_map::insert(InputIt first, InputIt last, Hash hash, - KeyEqual key_equal) + KeyEqual key_equal, + cudaStream_t stream) { auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } @@ -64,14 +72,18 @@ void static_map::insert(InputIt first, // TODO: memset an atomic variable is unsafe static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type))); + CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); std::size_t h_num_successes; detail::insert - <<>>(first, first + num_keys, num_successes_, view, hash, key_equal); + <<>>(first, first + num_keys, num_successes_, + view, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); + &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, + stream)); + // stream'd execution assumes sync not required + if (stream == NULL) + CUCO_CUDA_TRY(cudaDeviceSynchronize()); // ensures legacy behavior size_ += h_num_successes; } @@ -79,7 +91,8 @@ void static_map::insert(InputIt first, template template void static_map::find( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal, + cudaStream_t stream) { auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } @@ -91,14 +104,17 @@ void static_map::find( auto view = get_device_view(); detail::find - <<>>(first, last, output_begin, view, hash, key_equal); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); + <<>>(first, last, output_begin, view, hash, key_equal); + // stream'd execution assumes sync not required + if (stream == NULL) + CUCO_CUDA_TRY(cudaDeviceSynchronize()); // ensures 
legacy behavior } template template void static_map::contains( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal, + cudaStream_t stream) { auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } @@ -110,8 +126,10 @@ void static_map::contains( auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, hash, key_equal); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); + <<>>(first, last, output_begin, view, hash, key_equal); + // stream'd execution assumes sync not required + if (stream == NULL) + CUCO_CUDA_TRY(cudaDeviceSynchronize()); // ensures legacy behavior } template diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 582549de3..a9bd1a207 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -193,7 +193,8 @@ class static_map { static_map(std::size_t capacity, Key empty_key_sentinel, Value empty_value_sentinel, - Allocator const& alloc = Allocator{}); + Allocator const& alloc = Allocator{}, + cudaStream_t stream = NULL); /** * @brief Destroys the map and frees its contents. @@ -219,7 +220,8 @@ class static_map { template , typename KeyEqual = thrust::equal_to> - void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = NULL); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. @@ -247,7 +249,8 @@ class static_map { InputIt last, OutputIt output_begin, Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}); + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = NULL); /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. 
@@ -274,7 +277,8 @@ class static_map { InputIt last, OutputIt output_begin, Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}); + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = NULL); private: class device_view_base { @@ -1056,6 +1060,7 @@ class static_map { atomic_ctr_type* num_successes_{}; ///< Number of successfully inserted keys on insert slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_` + cudaStream_t exec_stream_{}; ///< Cuda stream for allocator and execution }; } // namespace cuco From b8c8e6c065edccae610bf9e314d9deb28f28b041 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 1 Nov 2021 15:45:54 -0400 Subject: [PATCH 16/70] Use transform iterator instead of materializing the inputs --- examples/static_map/custom_type.cu | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/static_map/custom_type.cu b/examples/static_map/custom_type.cu index c4dab460f..a4c39abd3 100644 --- a/examples/static_map/custom_type.cu +++ b/examples/static_map/custom_type.cu @@ -90,13 +90,10 @@ int main(void) auto const empty_key_sentinel = custom_key_type{-1}; auto const empty_value_sentinel = custom_value_type{-1}; - thrust::device_vector> pairs(num_pairs); // Create a sequence of 80'000 pairs - thrust::transform( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_pairs), - pairs.begin(), - [] __device__(auto i) { return thrust::make_pair(custom_key_type{i}, custom_value_type{i}); }); + auto pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::make_pair(custom_key_type{i}, custom_value_type{i}); }); // Construct a map with 100,000 slots using the given empty key/value sentinels. Note the // capacity is chosen knowing we will insert 80,000 keys, for an load factor of 80%. 
@@ -104,7 +101,7 @@ int main(void) 100'000, empty_key_sentinel, empty_value_sentinel}; // Inserts all pairs into the map by using the custom hasher and custom equality callable - map.insert(pairs.begin(), pairs.end(), custom_hash{}, custom_key_equals{}); + map.insert(pairs_begin, pairs_begin + num_pairs, custom_hash{}, custom_key_equals{}); // Reproduce inserted keys auto insert_keys = From 5be1e109bfeab83773afb6a9fe20344d20284403 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 1 Nov 2021 17:02:40 -0400 Subject: [PATCH 17/70] Fetch the latest libcudacxx + remove unnecessary alignment --- cmake/thirdparty/get_libcudacxx.cmake | 6 +++--- examples/static_map/custom_type.cu | 15 ++++----------- tests/static_map/static_map_test.cu | 10 ++++++---- tests/static_multimap/static_multimap_test.cu | 6 ++---- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake index 23328dd5b..1b591326c 100644 --- a/cmake/thirdparty/get_libcudacxx.cmake +++ b/cmake/thirdparty/get_libcudacxx.cmake @@ -12,13 +12,13 @@ # the License. 
# ============================================================================= -# Use CPM to find or clone thrust +# Use CPM to find or clone libcudacxx function(find_and_configure_libcudacxx VERSION) rapids_cpm_find( libcudacxx ${VERSION} CPM_ARGS GIT_REPOSITORY https://github.com/NVIDIA/libcudacxx.git - GIT_TAG ${VERSION} + GIT_TAG ${VERSION}-ea GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE ) @@ -31,4 +31,4 @@ function(find_and_configure_libcudacxx VERSION) install(DIRECTORY ${libcudacxx_SOURCE_DIR}/libcxx/include/ DESTINATION include/cuco/libcxx/include) endfunction() -find_and_configure_libcudacxx(1.4.0) +find_and_configure_libcudacxx(1.7.0) # CMake find_package takes number only diff --git a/examples/static_map/custom_type.cu b/examples/static_map/custom_type.cu index a4c39abd3..5581e6c79 100644 --- a/examples/static_map/custom_type.cu +++ b/examples/static_map/custom_type.cu @@ -22,19 +22,12 @@ // User-defined key type #ifdef BUILD_PASCAL_CODE -// Manual alignment required due to WAR libcu++ bug where cuda::atomic fails for underaligned types -struct alignas(8) custom_key_type { +struct custom_key_type { int32_t a; int32_t b; __host__ __device__ custom_key_type() {} __host__ __device__ custom_key_type(int32_t x) : a{x}, b{x} {} - - // Device equality operator is mandatory - __device__ bool operator==(custom_key_type const& other) const - { - return a == other.a and b == other.b; - } }; #else // Key type larger than 8B only supported for sm_70 and up @@ -46,7 +39,8 @@ struct custom_key_type { __host__ __device__ custom_key_type() {} __host__ __device__ custom_key_type(int32_t x) : a{x}, b{x}, c{x} {} - // Device equality operator is mandatory + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 __device__ bool operator==(custom_key_type const& other) const { return a == other.a and b == other.b and c == other.c; @@ -55,8 +49,7 @@ struct custom_key_type { #endif // User-defined value type -// Manual 
alignment required due to WAR libcu++ bug where cuda::atomic fails for underaligned types -struct alignas(8) custom_value_type { +struct custom_value_type { int32_t f; int32_t s; diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 7acf5f2e2..684b4fe60 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -83,16 +83,16 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) } // User-defined key type -// Need to specify alignment to WAR libcu++ bug where cuda::atomic fails for underaligned types: -// https://github.com/NVIDIA/libcudacxx/issues/160 template -struct alignas(8) key_pair_type { +struct key_pair_type { T a; T b; __host__ __device__ key_pair_type() {} __host__ __device__ key_pair_type(T x) : a{x}, b{x} {} + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 __device__ bool operator==(key_pair_type const& other) const { return a == other.a and b == other.b; @@ -109,6 +109,8 @@ struct large_key_type { __host__ __device__ large_key_type() {} __host__ __device__ large_key_type(T x) : a{x}, b{x}, c{x} {} + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 __device__ bool operator==(large_key_type const& other) const { return a == other.a and b == other.b and c == other.c; @@ -117,7 +119,7 @@ struct large_key_type { // User-defined value type template -struct alignas(8) value_pair_type { +struct value_pair_type { T f; T s; diff --git a/tests/static_multimap/static_multimap_test.cu b/tests/static_multimap/static_multimap_test.cu index 8191ab0d9..0140d3223 100644 --- a/tests/static_multimap/static_multimap_test.cu +++ b/tests/static_multimap/static_multimap_test.cu @@ -55,9 +55,7 @@ bool none_of(Iterator begin, Iterator end, Predicate p) enum class probe_sequence { linear_probing, double_hashing }; // User-defined key type -// Need to 
specify alignment to WAR libcu++ bug where cuda::atomic fails for underaligned types: -// https://github.com/NVIDIA/libcudacxx/issues/160 -struct alignas(8) key_pair { +struct key_pair { int32_t a; int32_t b; __device__ bool operator!=(key_pair const& other) const { return a != other.a and b != other.b; } @@ -74,7 +72,7 @@ struct key_pair_equals { } }; -struct alignas(8) value_pair { +struct value_pair { int32_t f; int32_t s; }; From 68227f6b05406849aa5abe510011e436bf9983cb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 1 Nov 2021 17:37:24 -0400 Subject: [PATCH 18/70] Define and use CUCO_NO_INDEPENDENT_THREADS macro in sub-level CMake files --- CMakeLists.txt | 7 ------- examples/CMakeLists.txt | 4 ++++ examples/static_map/custom_type.cu | 2 +- tests/CMakeLists.txt | 4 ++++ tests/static_map/static_map_test.cu | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b80ea8d6d..eb3f601fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,13 +80,6 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") set(default_build_option_state ON) endif() -foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) - if("${arch}" MATCHES "60") - add_compile_definitions(BUILD_PASCAL_CODE) - break() - endif() -endforeach() - option(BUILD_TESTS "Configure CMake to build tests" ${default_build_option_state}) option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" ${default_build_option_state}) option(BUILD_EXAMPLES "Configure CMake to build examples" ${default_build_option_state}) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ae7f724f5..7a32138bb 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,3 +20,7 @@ endfunction(ConfigureExample) ConfigureExample(STATIC_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_example.cu") ConfigureExample(CUSTOM_TYPE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type.cu") ConfigureExample(STATIC_MULTIMAP_EXAMPLE 
"${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/static_multimap_example.cu") + +if("60" IN_LIST CMAKE_CUDA_ARCHITECTURES) + target_compile_definitions(CUSTOM_TYPE PRIVATE CUCO_NO_INDEPENDENT_THREADS) +endif() diff --git a/examples/static_map/custom_type.cu b/examples/static_map/custom_type.cu index 5581e6c79..33b30e578 100644 --- a/examples/static_map/custom_type.cu +++ b/examples/static_map/custom_type.cu @@ -21,7 +21,7 @@ #include // User-defined key type -#ifdef BUILD_PASCAL_CODE +#ifdef CUCO_NO_INDEPENDENT_THREADS struct custom_key_type { int32_t a; int32_t b; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 33bb1bfc4..405bb4b66 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -39,6 +39,10 @@ set(STATIC_MAP_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_test.cu") ConfigureTest(STATIC_MAP_TEST "${STATIC_MAP_TEST_SRC}") + +if("60" IN_LIST CMAKE_CUDA_ARCHITECTURES) + target_compile_definitions(STATIC_MAP_TEST PRIVATE CUCO_NO_INDEPENDENT_THREADS) +endif() #################################################################################################### set(DYNAMIC_MAP_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/dynamic_map/dynamic_map_test.cu") diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 684b4fe60..46155d3d9 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -161,7 +161,7 @@ struct custom_equals { TEMPLATE_TEST_CASE_SIG("User defined key and value type", "", ((typename Key, typename Value), Key, Value), -#ifndef BUILD_PASCAL_CODE // Key type larger than 8B only supported for sm_70 and up +#ifndef CUCO_NO_INDEPENDENT_THREADS // Key type larger than 8B only supported for sm_70 and up (key_pair_type, value_pair_type), (key_pair_type, value_pair_type), (large_key_type, value_pair_type), From a0570665a878f3b0aea04d1b5a4bf42aa5b3fd1d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 1 Nov 2021 19:56:36 -0400 Subject: [PATCH 19/70] Use MATCH 
instead of IN_LIST in CMake --- cmake/thirdparty/get_libcudacxx.cmake | 2 +- examples/CMakeLists.txt | 9 ++++++--- tests/CMakeLists.txt | 9 ++++++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake index 1b591326c..b29b02a6e 100644 --- a/cmake/thirdparty/get_libcudacxx.cmake +++ b/cmake/thirdparty/get_libcudacxx.cmake @@ -31,4 +31,4 @@ function(find_and_configure_libcudacxx VERSION) install(DIRECTORY ${libcudacxx_SOURCE_DIR}/libcxx/include/ DESTINATION include/cuco/libcxx/include) endfunction() -find_and_configure_libcudacxx(1.7.0) # CMake find_package takes number only +find_and_configure_libcudacxx(1.7.0) # CMake find_package takes numbers only diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 7a32138bb..3788c764c 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,6 +21,9 @@ ConfigureExample(STATIC_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/stat ConfigureExample(CUSTOM_TYPE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type.cu") ConfigureExample(STATIC_MULTIMAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/static_multimap_example.cu") -if("60" IN_LIST CMAKE_CUDA_ARCHITECTURES) - target_compile_definitions(CUSTOM_TYPE PRIVATE CUCO_NO_INDEPENDENT_THREADS) -endif() +foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + if("${arch}" MATCHES "60") + target_compile_definitions(CUSTOM_TYPE PRIVATE CUCO_NO_INDEPENDENT_THREADS) + break() + endif() +endforeach() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 405bb4b66..4f02b69de 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -40,9 +40,12 @@ set(STATIC_MAP_TEST_SRC ConfigureTest(STATIC_MAP_TEST "${STATIC_MAP_TEST_SRC}") -if("60" IN_LIST CMAKE_CUDA_ARCHITECTURES) - target_compile_definitions(STATIC_MAP_TEST PRIVATE CUCO_NO_INDEPENDENT_THREADS) -endif() +foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + if("${arch}" MATCHES "60") + 
target_compile_definitions(STATIC_MAP_TEST PRIVATE CUCO_NO_INDEPENDENT_THREADS) + break() + endif() +endforeach() #################################################################################################### set(DYNAMIC_MAP_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/dynamic_map/dynamic_map_test.cu") From bf6a90db78516e099d07e845a39012dbcaa8de18 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 2 Nov 2021 14:53:28 -0400 Subject: [PATCH 20/70] Use rapids_cpm_libcudacxx to support installed libcudacxx Currently if an installed version of libcudacxx is found, the install paths computed in get_libcudacxx are wrong. Resolve this issue by instead using rapids-cmake --- CMakeLists.txt | 8 ++++---- cmake/thirdparty/get_libcudacxx.cmake | 24 +++++++----------------- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eb3f601fd..a5913d156 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,7 @@ add_library(cuco::cuco ALIAS cuco) target_include_directories(cuco INTERFACE INTERFACE $ $) -target_link_libraries(cuco INTERFACE libcudacxx CUDA::toolkit $) +target_link_libraries(cuco INTERFACE CUDA::toolkit $) target_compile_features(cuco INTERFACE cxx_std_17 cuda_std_17) ################################################################################################### @@ -108,7 +108,7 @@ endif(BUILD_EXAMPLES) ################################################################################################### # - Install targets ------------------------------------------------------------------------------- -install(TARGETS cuco libcudacxx EXPORT cuco-exports) +install(TARGETS cuco EXPORT cuco-exports) install(DIRECTORY include/cuco/ DESTINATION include/cuco) install(FILES ${CUCO_BINARY_DIR}/include/cuco/version_config.hpp DESTINATION include/cuco) @@ -131,7 +131,7 @@ structures tailored for efficient use with GPUs. 
rapids_export( INSTALL cuco EXPORT_SET cuco-exports - GLOBAL_TARGETS cuco libcudacxx + GLOBAL_TARGETS cuco NAMESPACE cuco:: DOCUMENTATION doc_string) @@ -146,7 +146,7 @@ endif() rapids_export( BUILD cuco EXPORT_SET cuco-exports - GLOBAL_TARGETS cuco libcudacxx + GLOBAL_TARGETS cuco NAMESPACE cuco:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string) diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake index 23328dd5b..48f3dfecb 100644 --- a/cmake/thirdparty/get_libcudacxx.cmake +++ b/cmake/thirdparty/get_libcudacxx.cmake @@ -13,22 +13,12 @@ # ============================================================================= # Use CPM to find or clone thrust -function(find_and_configure_libcudacxx VERSION) - rapids_cpm_find( - libcudacxx ${VERSION} - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/libcudacxx.git - GIT_TAG ${VERSION} - GIT_SHALLOW TRUE - DOWNLOAD_ONLY TRUE - ) - # TODO: Once libcu++ exports a target, use that instead - add_library(libcudacxx INTERFACE) - target_include_directories(libcudacxx - INTERFACE $ - $) - install(DIRECTORY ${libcudacxx_SOURCE_DIR}/include/ DESTINATION include/cuco/libcudacxx) - install(DIRECTORY ${libcudacxx_SOURCE_DIR}/libcxx/include/ DESTINATION include/cuco/libcxx/include) +function(find_and_configure_libcudacxx) + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) + + rapids_cpm_libcudacxx(BUILD_EXPORT_SET cuco-exports + INSTALL_EXPORT_SET cuco-exports) + endfunction() -find_and_configure_libcudacxx(1.4.0) +find_and_configure_libcudacxx() From 1fee0e7d50f8e7674993e298f045b130aedd8413 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 2 Nov 2021 16:57:11 -0400 Subject: [PATCH 21/70] Make sure we specify libcudacxx include paths --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5913d156..956241ea6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,7 @@ add_library(cuco::cuco ALIAS cuco) 
target_include_directories(cuco INTERFACE INTERFACE $ $) -target_link_libraries(cuco INTERFACE CUDA::toolkit $) +target_link_libraries(cuco INTERFACE libcudacxx::libcudacxx CUDA::toolkit $) target_compile_features(cuco INTERFACE cxx_std_17 cuda_std_17) ################################################################################################### From 062ca04671980a5b299ffaffc7e3f151b8c6a5bd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 3 Nov 2021 20:30:48 -0400 Subject: [PATCH 22/70] Add key_sentinel_test --- tests/static_map/key_sentinel_test.cu | 84 +++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tests/static_map/key_sentinel_test.cu diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu new file mode 100644 index 000000000..40f130c22 --- /dev/null +++ b/tests/static_map/key_sentinel_test.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include + +#include + +#define SIZE 10 +__device__ int A[SIZE]; + +template +struct custom_equals { + __device__ bool operator()(T lhs, T rhs) { return A[lhs] == A[rhs]; } +}; + +TEMPLATE_TEST_CASE_SIG( + "Key comparison against sentinel", "", ((typename T), T), (int32_t), (int64_t)) +{ + using Key = T; + using Value = T; + + constexpr std::size_t num_keys{SIZE}; + cuco::static_map map{SIZE * 2, -1, -1}; + + auto m_view = map.get_device_mutable_view(); + auto view = map.get_device_view(); + + int h_A[SIZE]; + for (int i = 0; i < SIZE; i++) { + h_A[i] = i; + } + cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int)); + + auto pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair_type(i, i); }); + + SECTION( + "Tests of non-CG insert: The custom `key_equal` can never be used to compare against sentinel") + { + REQUIRE(all_of(pairs_begin, + pairs_begin + num_keys, + [m_view] __device__(cuco::pair_type const& pair) mutable { + return m_view.insert( + pair, cuco::detail::MurmurHash3_32{}, custom_equals{}); + })); + } + + SECTION( + "Tests of CG insert: The custom `key_equal` can never be used to compare against sentinel") + { + map.insert(pairs_begin, + pairs_begin + num_keys, + cuco::detail::MurmurHash3_32{}, + custom_equals{}); + // All keys inserted via custom `key_equal` should be found + REQUIRE(all_of(pairs_begin, + pairs_begin + num_keys, + [view] __device__(cuco::pair_type const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and (found->first.load() == pair.first and + found->second.load() == pair.second); + })); + } +} From f9bbbf9a94b648891814b978a2b1fc8662e8d818 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 3 Nov 2021 20:44:15 -0400 Subject: [PATCH 23/70] Split multimap tests into multiple files --- tests/CMakeLists.txt | 6 +- tests/static_multimap/custom_type_test.cu | 236 ++++++ 
tests/static_multimap/insert_if_test.cu | 80 ++ tests/static_multimap/multiplicity_test.cu | 178 +++++ tests/static_multimap/non_match_test.cu | 152 ++++ tests/static_multimap/pair_function_test.cu | 126 ++++ tests/static_multimap/static_multimap_test.cu | 701 ------------------ tests/util.hpp | 2 + 8 files changed, 779 insertions(+), 702 deletions(-) create mode 100644 tests/static_multimap/custom_type_test.cu create mode 100644 tests/static_multimap/insert_if_test.cu create mode 100644 tests/static_multimap/multiplicity_test.cu create mode 100644 tests/static_multimap/non_match_test.cu create mode 100644 tests/static_multimap/pair_function_test.cu delete mode 100644 tests/static_multimap/static_multimap_test.cu diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 675c0d3a6..8923bf56a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -67,7 +67,11 @@ ConfigureTest(DYNAMIC_MAP_TEST ################################################################################################### # - static_multimap tests ------------------------------------------------------------------------- ConfigureTest(STATIC_MULTIMAP_TEST - static_multimap/static_multimap_test.cu) + static_multimap/custom_type_test.cu + static_multimap/insert_if_test.cu + static_multimap/multiplicity_test.cu + static_multimap/non_match_test.cu + static_multimap/pair_function_test.cu) ################################################################################################### foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) diff --git a/tests/static_multimap/custom_type_test.cu b/tests/static_multimap/custom_type_test.cu new file mode 100644 index 000000000..d54ac7f08 --- /dev/null +++ b/tests/static_multimap/custom_type_test.cu @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +#include + +#include + +// User-defined key type +struct key_pair { + int32_t a; + int32_t b; + __device__ bool operator!=(key_pair const& other) const { return a != other.a and b != other.b; } +}; + +struct hash_key_pair { + __device__ uint32_t operator()(key_pair k) { return k.a; }; +}; + +struct key_pair_equals { + __device__ bool operator()(const key_pair& lhs, const key_pair& rhs) + { + return std::tie(lhs.a, lhs.b) == std::tie(rhs.a, rhs.b); + } +}; + +struct value_pair { + int32_t f; + int32_t s; +}; + +template +__inline__ void test_custom_key_value_type(Map& map, + PairIt pair_begin, + KeyIt key_begin, + size_t num_pairs) +{ + constexpr cudaStream_t stream = 0; + + SECTION("All inserted keys-value pairs should be correctly recovered during find") + { + map.insert(pair_begin, pair_begin + num_pairs); + + auto res = map.get_size(); + REQUIRE(res == num_pairs); + + auto count = map.count(key_begin, key_begin + num_pairs, stream, key_pair_equals{}); + REQUIRE(count == num_pairs); + + thrust::device_vector> found_pairs(num_pairs); + auto output_end = map.retrieve( + key_begin, key_begin + num_pairs, found_pairs.begin(), stream, key_pair_equals{}); + auto size = output_end - found_pairs.begin(); + + REQUIRE(size == num_pairs); + + // sort before compare + thrust::sort( + thrust::device, + found_pairs.begin(), + found_pairs.end(), + [] __device__(const cuco::pair_type& lhs, + const cuco::pair_type& rhs) { return lhs.first.a < rhs.first.a; }); + + REQUIRE(thrust::equal( + thrust::device, + 
pair_begin, + pair_begin + num_pairs, + found_pairs.begin(), + [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { + return lhs.first.a == rhs.first.a; + })); + } + + SECTION("Non-matches are not included in the output") + { + map.insert(pair_begin, pair_begin + num_pairs); + + auto const num = num_pairs * 2; + thrust::device_vector query_keys(num); + auto query_key_begin = query_keys.begin(); + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num), + query_key_begin, + [] __device__(auto i) { + return Key{i, i}; + }); + + auto count = map.count(query_key_begin, query_key_begin + num, stream, key_pair_equals{}); + REQUIRE(count == num_pairs); + + thrust::device_vector> found_pairs(num_pairs); + auto output_end = map.retrieve( + query_key_begin, query_key_begin + num, found_pairs.begin(), stream, key_pair_equals{}); + auto size = output_end - found_pairs.begin(); + + REQUIRE(size == num_pairs); + + // sort before compare + thrust::sort( + thrust::device, + found_pairs.begin(), + found_pairs.end(), + [] __device__(const cuco::pair_type& lhs, + const cuco::pair_type& rhs) { return lhs.first.a < rhs.first.a; }); + + REQUIRE(thrust::equal( + thrust::device, + pair_begin, + pair_begin + num_pairs, + found_pairs.begin(), + [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { + return lhs.first.a == rhs.first.a; + })); + } + + SECTION("Outer functions include non-matches in the output") + { + map.insert(pair_begin, pair_begin + num_pairs); + + auto const num = num_pairs * 2; + thrust::device_vector query_keys(num); + auto query_key_begin = query_keys.begin(); + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num), + query_key_begin, + [] __device__(auto i) { + return Key{i, i}; + }); + + auto count_outer = + map.count_outer(query_key_begin, query_key_begin + num, stream, key_pair_equals{}); + REQUIRE(count_outer == num); + + thrust::device_vector> found_pairs(num); + auto 
output_end = map.retrieve_outer( + query_key_begin, query_key_begin + num, found_pairs.begin(), stream, key_pair_equals{}); + auto size_outer = output_end - found_pairs.begin(); + + REQUIRE(size_outer == num); + } + + SECTION("All inserted keys-value pairs should be contained") + { + map.insert(pair_begin, pair_begin + num_pairs); + + auto size = map.get_size(); + REQUIRE(size == num_pairs); + + thrust::device_vector contained(num_pairs); + map.contains(key_begin, key_begin + num_pairs, contained.begin(), stream, key_pair_equals{}); + REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); + } + + SECTION("Non-inserted keys-value pairs should not be contained") + { + auto size = map.get_size(); + REQUIRE(size == 0); + + thrust::device_vector contained(num_pairs); + map.contains(key_begin, key_begin + num_pairs, contained.begin(), stream, key_pair_equals{}); + REQUIRE( + none_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); + } +} + +TEMPLATE_TEST_CASE_SIG("User defined key and value type", + "", + ((probe_sequence Probe), Probe), + (probe_sequence::linear_probing), + (probe_sequence::double_hashing)) +{ + using Key = key_pair; + using Value = value_pair; + + auto constexpr sentinel_key = Key{-1, -1}; + auto constexpr sentinel_value = Value{-1, -1}; + + constexpr std::size_t num_pairs = 100; + constexpr std::size_t capacity = num_pairs * 2; + + thrust::device_vector insert_keys(num_pairs); + thrust::device_vector insert_values(num_pairs); + + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_pairs), + insert_keys.begin(), + [] __device__(auto i) { + return Key{i, i}; + }); + + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_pairs), + insert_values.begin(), + [] __device__(auto i) { + return Value{i, i}; + }); + auto insert_pairs = + 
thrust::make_zip_iterator(thrust::make_tuple(insert_keys.begin(), insert_values.begin())); + + if constexpr (Probe == probe_sequence::linear_probing) { + cuco::static_multimap, + cuco::linear_probing<1, hash_key_pair>> + map{capacity, sentinel_key, sentinel_value}; + test_custom_key_value_type(map, insert_pairs, insert_keys.begin(), num_pairs); + } + if constexpr (Probe == probe_sequence::double_hashing) { + cuco::static_multimap map{capacity, sentinel_key, sentinel_value}; + test_custom_key_value_type(map, insert_pairs, insert_keys.begin(), num_pairs); + } +} diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu new file mode 100644 index 000000000..e8c8bff02 --- /dev/null +++ b/tests/static_multimap/insert_if_test.cu @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include + +#include + +template +__inline__ void test_insert_if(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t size) +{ + // 50% insertion + auto pred_lambda = [] __device__(Key k) { return k % 2 == 0; }; + + map.insert_if(pair_begin, pair_begin + size, key_begin, pred_lambda); + + auto res = map.get_size(); + REQUIRE(res * 2 == size); + + auto num = map.count(key_begin, key_begin + size); + REQUIRE(num * 2 == size); +} + +TEMPLATE_TEST_CASE_SIG("Tests of insert_if", + "", + ((typename Key, typename Value, probe_sequence Probe), Key, Value, Probe), + (int32_t, int32_t, probe_sequence::linear_probing), + (int32_t, int64_t, probe_sequence::linear_probing), + (int64_t, int64_t, probe_sequence::linear_probing), + (int32_t, int32_t, probe_sequence::double_hashing), + (int32_t, int64_t, probe_sequence::double_hashing), + (int64_t, int64_t, probe_sequence::double_hashing)) +{ + constexpr std::size_t num_keys{1'000}; + + thrust::device_vector d_keys(num_keys); + thrust::device_vector> d_pairs(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + // multiplicity = 1 + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_keys), + d_pairs.begin(), + [] __device__(auto i) { + return cuco::pair_type{i, i}; + }); + + if constexpr (Probe == probe_sequence::linear_probing) { + cuco::static_multimap, + cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + map{num_keys * 2, -1, -1}; + test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); + } + if constexpr (Probe == probe_sequence::double_hashing) { + cuco::static_multimap map{num_keys * 2, -1, -1}; + test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); + } +} diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu new file mode 100644 index 000000000..0942b6d91 --- /dev/null +++ b/tests/static_multimap/multiplicity_test.cu @@ -0,0 
+1,178 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +#include + +#include + +template +__inline__ void test_multiplicity_two( + Map& map, PairIt pair_begin, KeyIt key_begin, ResultIt result_begin, std::size_t num_items) +{ + auto num_keys = num_items / 2; + thrust::device_vector d_contained(num_keys); + + SECTION("Non-inserted key/value pairs should not be contained.") + { + auto size = map.get_size(); + REQUIRE(size == 0); + + map.contains(key_begin, key_begin + num_keys, d_contained.begin()); + + REQUIRE( + none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + } + + map.insert(pair_begin, pair_begin + num_items); + + SECTION("All inserted key/value pairs should be contained.") + { + auto size = map.get_size(); + REQUIRE(size == num_items); + + map.contains(key_begin, key_begin + num_keys, d_contained.begin()); + + REQUIRE( + all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + } + + SECTION("Total count should be equal to the number of inserted pairs.") + { + // Count matching keys + auto num = map.count(key_begin, key_begin + num_keys); + + REQUIRE(num == num_items); + + auto output_begin = result_begin; + auto output_end = map.retrieve(key_begin, key_begin + num_keys, output_begin); + auto size = thrust::distance(output_begin, output_end); + + REQUIRE(size == 
num_items); + + // sort before compare + thrust::sort(thrust::device, + output_begin, + output_end, + [] __device__(const cuco::pair_type& lhs, + const cuco::pair_type& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE(thrust::equal( + thrust::device, + pair_begin, + pair_begin + num_items, + output_begin, + [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); + } + + SECTION("count and count_outer should return the same value.") + { + auto num = map.count(key_begin, key_begin + num_keys); + auto num_outer = map.count_outer(key_begin, key_begin + num_keys); + + REQUIRE(num == num_outer); + } + + SECTION("Output of retrieve and retrieve_outer should be the same.") + { + auto output_begin = result_begin; + auto output_end = map.retrieve(key_begin, key_begin + num_keys, output_begin); + auto size = thrust::distance(output_begin, output_end); + + output_end = map.retrieve_outer(key_begin, key_begin + num_keys, output_begin); + auto size_outer = thrust::distance(output_begin, output_end); + + REQUIRE(size == size_outer); + + // sort before compare + thrust::sort(thrust::device, + output_begin, + output_end, + [] __device__(const cuco::pair_type& lhs, + const cuco::pair_type& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE(thrust::equal( + thrust::device, + pair_begin, + pair_begin + num_items, + output_begin, + [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); + } +} + +TEMPLATE_TEST_CASE_SIG("Multiplicity equals two", + "", + ((typename Key, typename Value, probe_sequence Probe), Key, Value, Probe), + (int32_t, int32_t, probe_sequence::linear_probing), + (int32_t, int64_t, probe_sequence::linear_probing), + (int64_t, int64_t, probe_sequence::linear_probing), + (int32_t, 
int32_t, probe_sequence::double_hashing), + (int32_t, int64_t, probe_sequence::double_hashing), + (int64_t, int64_t, probe_sequence::double_hashing)) +{ + constexpr std::size_t num_items{4}; + + thrust::device_vector d_keys(num_items / 2); + thrust::device_vector> d_pairs(num_items); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + // multiplicity = 2 + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_items), + d_pairs.begin(), + [] __device__(auto i) { + return cuco::pair_type{i / 2, i}; + }); + + thrust::device_vector> d_results(num_items); + + if constexpr (Probe == probe_sequence::linear_probing) { + cuco::static_multimap, + cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + map{5, -1, -1}; + test_multiplicity_two( + map, d_pairs.begin(), d_keys.begin(), d_results.begin(), num_items); + } + if constexpr (Probe == probe_sequence::double_hashing) { + cuco::static_multimap map{5, -1, -1}; + test_multiplicity_two( + map, d_pairs.begin(), d_keys.begin(), d_results.begin(), num_items); + } +} diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu new file mode 100644 index 000000000..1ff463bc3 --- /dev/null +++ b/tests/static_multimap/non_match_test.cu @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include + +#include + +template +__inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t num_keys) +{ + map.insert(pair_begin, pair_begin + num_keys); + + auto res = map.get_size(); + REQUIRE(res == num_keys); + + SECTION("Output of count and retrieve should be coherent.") + { + auto num = map.count(key_begin, key_begin + num_keys); + thrust::device_vector> d_results(num); + + REQUIRE(num == num_keys); + + auto output_begin = d_results.data().get(); + auto output_end = map.retrieve(key_begin, key_begin + num_keys, output_begin); + auto size = thrust::distance(output_begin, output_end); + + REQUIRE(size == num_keys); + + // sort before compare + thrust::sort(thrust::device, + output_begin, + output_end, + [] __device__(const cuco::pair_type& lhs, + const cuco::pair_type& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE(thrust::equal( + thrust::device, + pair_begin, + pair_begin + num_keys, + output_begin, + [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); + } + + SECTION("Output of count_outer and retrieve_outer should be coherent.") + { + auto num = map.count_outer(key_begin, key_begin + num_keys); + thrust::device_vector> d_results(num); + + REQUIRE(num == (num_keys + num_keys / 2)); + + auto output_begin = d_results.data().get(); + auto output_end = map.retrieve_outer(key_begin, key_begin + num_keys, output_begin); + auto size = thrust::distance(output_begin, output_end); + + REQUIRE(size == (num_keys + num_keys / 2)); + + // sort before compare + thrust::sort(thrust::device, + output_begin, + output_end, + [] __device__(const cuco::pair_type& lhs, + const cuco::pair_type& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + // create gold reference + 
thrust::device_vector> gold(size); + auto gold_begin = gold.begin(); + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(size), + gold_begin, + [num_keys] __device__(auto i) { + if (i < num_keys) { return cuco::pair_type{i / 2, i}; } + return cuco::pair_type{i - num_keys / 2, -1}; + }); + + REQUIRE(thrust::equal( + thrust::device, + gold_begin, + gold_begin + size, + output_begin, + [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); + } +} + +TEMPLATE_TEST_CASE_SIG("Tests of non-matches", + "", + ((typename Key, typename Value, probe_sequence Probe), Key, Value, Probe), + (int32_t, int32_t, probe_sequence::linear_probing), + (int32_t, int64_t, probe_sequence::linear_probing), + (int64_t, int64_t, probe_sequence::linear_probing), + (int32_t, int32_t, probe_sequence::double_hashing), + (int32_t, int64_t, probe_sequence::double_hashing), + (int64_t, int64_t, probe_sequence::double_hashing)) +{ + constexpr std::size_t num_keys{1'000'000}; + + thrust::device_vector d_keys(num_keys); + thrust::device_vector> d_pairs(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + // multiplicity = 2 + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_keys), + d_pairs.begin(), + [] __device__(auto i) { + return cuco::pair_type{i / 2, i}; + }); + + if constexpr (Probe == probe_sequence::linear_probing) { + cuco::static_multimap, + cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + map{num_keys * 2, -1, -1}; + test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); + } + if constexpr (Probe == probe_sequence::double_hashing) { + cuco::static_multimap map{num_keys * 2, -1, -1}; + test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); + } +} diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu new file mode 100644 index 
000000000..39b454898 --- /dev/null +++ b/tests/static_multimap/pair_function_test.cu @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include + +#include + +// Custom pair equal +template +struct pair_equal { + __device__ bool operator()(const cuco::pair_type& lhs, + const cuco::pair_type& rhs) const + { + return lhs.first == rhs.first; + } +}; + +template +__inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num_pairs) +{ + map.insert(pair_begin, pair_begin + num_pairs); + cudaStreamSynchronize(0); + + auto res = map.get_size(); + REQUIRE(res == num_pairs); + + // query pair matching rate = 50% + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_pairs), + pair_begin, + [] __device__(auto i) { + return cuco::pair_type{i, i}; + }); + + SECTION("Output of pair_count and pair_retrieve should be coherent.") + { + auto num = map.pair_count(pair_begin, pair_begin + num_pairs, pair_equal{}); + + auto out1_begin = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); + auto out2_begin = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); + + REQUIRE(num == num_pairs); + + auto [out1_end, out2_end] = map.pair_retrieve( + pair_begin, pair_begin + 
num_pairs, out1_begin, out2_begin, pair_equal{}); + + REQUIRE((out1_end - out1_begin) == num_pairs); + } + + SECTION("Output of pair_count_outer and pair_retrieve_outer should be coherent.") + { + auto num = map.pair_count_outer(pair_begin, pair_begin + num_pairs, pair_equal{}); + + auto out1_begin = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); + auto out2_begin = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); + + REQUIRE(num == (num_pairs + num_pairs / 2)); + + auto [out1_end, out2_end] = map.pair_retrieve_outer( + pair_begin, pair_begin + num_pairs, out1_begin, out2_begin, pair_equal{}); + + REQUIRE((out1_end - out1_begin) == (num_pairs + num_pairs / 2)); + } +} + +TEMPLATE_TEST_CASE_SIG("Tests of pair functions", + "", + ((typename Key, typename Value, probe_sequence Probe), Key, Value, Probe), + (int32_t, int32_t, probe_sequence::linear_probing), + (int32_t, int64_t, probe_sequence::linear_probing), + (int64_t, int64_t, probe_sequence::linear_probing), + (int32_t, int32_t, probe_sequence::double_hashing), + (int32_t, int64_t, probe_sequence::double_hashing), + (int64_t, int64_t, probe_sequence::double_hashing)) +{ + constexpr std::size_t num_pairs{4}; + thrust::device_vector> d_pairs(num_pairs); + + // pair multiplicity = 2 + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_pairs), + d_pairs.begin(), + [] __device__(auto i) { + return cuco::pair_type{i / 2, i}; + }); + + if constexpr (Probe == probe_sequence::linear_probing) { + cuco::static_multimap, + cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + map{num_pairs * 2, -1, -1}; + test_pair_functions(map, d_pairs.begin(), num_pairs); + } + if constexpr (Probe == probe_sequence::double_hashing) { + cuco::static_multimap map{num_pairs * 2, -1, -1}; + test_pair_functions(map, d_pairs.begin(), num_pairs); + } +} diff 
--git a/tests/static_multimap/static_multimap_test.cu b/tests/static_multimap/static_multimap_test.cu deleted file mode 100644 index 0140d3223..000000000 --- a/tests/static_multimap/static_multimap_test.cu +++ /dev/null @@ -1,701 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include - -#include -#include -#include -#include - -#include - -namespace { -namespace cg = cooperative_groups; - -// Thrust logical algorithms (any_of/all_of/none_of) don't work with device -// lambdas: See https://github.com/thrust/thrust/issues/1062 -template -bool all_of(Iterator begin, Iterator end, Predicate p) -{ - auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); -} - -template -bool any_of(Iterator begin, Iterator end, Predicate p) -{ - return thrust::count_if(begin, end, p) > 0; -} - -template -bool none_of(Iterator begin, Iterator end, Predicate p) -{ - return not all_of(begin, end, p); -} - -} // namespace - -enum class probe_sequence { linear_probing, double_hashing }; - -// User-defined key type -struct key_pair { - int32_t a; - int32_t b; - __device__ bool operator!=(key_pair const& other) const { return a != other.a and b != other.b; } -}; - -struct hash_key_pair { - __device__ uint32_t operator()(key_pair k) { return k.a; }; -}; - -struct key_pair_equals { - __device__ bool operator()(const key_pair& lhs, const key_pair& rhs) - { - return 
std::tie(lhs.a, lhs.b) == std::tie(rhs.a, rhs.b); - } -}; - -struct value_pair { - int32_t f; - int32_t s; -}; - -template -__inline__ void test_custom_key_value_type(Map& map, - PairIt pair_begin, - KeyIt key_begin, - size_t num_pairs) -{ - constexpr cudaStream_t stream = 0; - - SECTION("All inserted keys-value pairs should be correctly recovered during find") - { - map.insert(pair_begin, pair_begin + num_pairs); - - auto res = map.get_size(); - REQUIRE(res == num_pairs); - - auto count = map.count(key_begin, key_begin + num_pairs, stream, key_pair_equals{}); - REQUIRE(count == num_pairs); - - thrust::device_vector> found_pairs(num_pairs); - auto output_end = map.retrieve( - key_begin, key_begin + num_pairs, found_pairs.begin(), stream, key_pair_equals{}); - auto size = output_end - found_pairs.begin(); - - REQUIRE(size == num_pairs); - - // sort before compare - thrust::sort( - thrust::device, - found_pairs.begin(), - found_pairs.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { return lhs.first.a < rhs.first.a; }); - - REQUIRE(thrust::equal( - thrust::device, - pair_begin, - pair_begin + num_pairs, - found_pairs.begin(), - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first.a == rhs.first.a; - })); - } - - SECTION("Non-matches are not included in the output") - { - map.insert(pair_begin, pair_begin + num_pairs); - - auto const num = num_pairs * 2; - thrust::device_vector query_keys(num); - auto query_key_begin = query_keys.begin(); - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num), - query_key_begin, - [] __device__(auto i) { - return Key{i, i}; - }); - - auto count = map.count(query_key_begin, query_key_begin + num, stream, key_pair_equals{}); - REQUIRE(count == num_pairs); - - thrust::device_vector> found_pairs(num_pairs); - auto output_end = map.retrieve( - query_key_begin, query_key_begin + num, found_pairs.begin(), stream, key_pair_equals{}); - auto 
size = output_end - found_pairs.begin(); - - REQUIRE(size == num_pairs); - - // sort before compare - thrust::sort( - thrust::device, - found_pairs.begin(), - found_pairs.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { return lhs.first.a < rhs.first.a; }); - - REQUIRE(thrust::equal( - thrust::device, - pair_begin, - pair_begin + num_pairs, - found_pairs.begin(), - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first.a == rhs.first.a; - })); - } - - SECTION("Outer functions include non-matches in the output") - { - map.insert(pair_begin, pair_begin + num_pairs); - - auto const num = num_pairs * 2; - thrust::device_vector query_keys(num); - auto query_key_begin = query_keys.begin(); - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num), - query_key_begin, - [] __device__(auto i) { - return Key{i, i}; - }); - - auto count_outer = - map.count_outer(query_key_begin, query_key_begin + num, stream, key_pair_equals{}); - REQUIRE(count_outer == num); - - thrust::device_vector> found_pairs(num); - auto output_end = map.retrieve_outer( - query_key_begin, query_key_begin + num, found_pairs.begin(), stream, key_pair_equals{}); - auto size_outer = output_end - found_pairs.begin(); - - REQUIRE(size_outer == num); - } - - SECTION("All inserted keys-value pairs should be contained") - { - map.insert(pair_begin, pair_begin + num_pairs); - - auto size = map.get_size(); - REQUIRE(size == num_pairs); - - thrust::device_vector contained(num_pairs); - map.contains(key_begin, key_begin + num_pairs, contained.begin(), stream, key_pair_equals{}); - REQUIRE(all_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); - } - - SECTION("Non-inserted keys-value pairs should not be contained") - { - auto size = map.get_size(); - REQUIRE(size == 0); - - thrust::device_vector contained(num_pairs); - map.contains(key_begin, key_begin + num_pairs, contained.begin(), 
stream, key_pair_equals{}); - REQUIRE( - none_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); - } -} - -TEMPLATE_TEST_CASE_SIG("User defined key and value type", - "", - ((probe_sequence Probe), Probe), - (probe_sequence::linear_probing), - (probe_sequence::double_hashing)) -{ - using Key = key_pair; - using Value = value_pair; - - auto constexpr sentinel_key = Key{-1, -1}; - auto constexpr sentinel_value = Value{-1, -1}; - - constexpr std::size_t num_pairs = 100; - constexpr std::size_t capacity = num_pairs * 2; - - thrust::device_vector insert_keys(num_pairs); - thrust::device_vector insert_values(num_pairs); - - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_pairs), - insert_keys.begin(), - [] __device__(auto i) { - return Key{i, i}; - }); - - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_pairs), - insert_values.begin(), - [] __device__(auto i) { - return Value{i, i}; - }); - auto insert_pairs = - thrust::make_zip_iterator(thrust::make_tuple(insert_keys.begin(), insert_values.begin())); - - if constexpr (Probe == probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, hash_key_pair>> - map{capacity, sentinel_key, sentinel_value}; - test_custom_key_value_type(map, insert_pairs, insert_keys.begin(), num_pairs); - } - if constexpr (Probe == probe_sequence::double_hashing) { - cuco::static_multimap map{capacity, sentinel_key, sentinel_value}; - test_custom_key_value_type(map, insert_pairs, insert_keys.begin(), num_pairs); - } -} - -template -__inline__ void test_multiplicity_two( - Map& map, PairIt pair_begin, KeyIt key_begin, ResultIt result_begin, std::size_t num_items) -{ - auto num_keys = num_items / 2; - thrust::device_vector d_contained(num_keys); - - SECTION("Non-inserted key/value pairs should not be contained.") - { - auto size = map.get_size(); - REQUIRE(size == 0); - - 
map.contains(key_begin, key_begin + num_keys, d_contained.begin()); - - REQUIRE( - none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); - } - - map.insert(pair_begin, pair_begin + num_items); - - SECTION("All inserted key/value pairs should be contained.") - { - auto size = map.get_size(); - REQUIRE(size == num_items); - - map.contains(key_begin, key_begin + num_keys, d_contained.begin()); - - REQUIRE( - all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); - } - - SECTION("Total count should be equal to the number of inserted pairs.") - { - // Count matching keys - auto num = map.count(key_begin, key_begin + num_keys); - - REQUIRE(num == num_items); - - auto output_begin = result_begin; - auto output_end = map.retrieve(key_begin, key_begin + num_keys, output_begin); - auto size = thrust::distance(output_begin, output_end); - - REQUIRE(size == num_items); - - // sort before compare - thrust::sort(thrust::device, - output_begin, - output_end, - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(thrust::equal( - thrust::device, - pair_begin, - pair_begin + num_items, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); - } - - SECTION("count and count_outer should return the same value.") - { - auto num = map.count(key_begin, key_begin + num_keys); - auto num_outer = map.count_outer(key_begin, key_begin + num_keys); - - REQUIRE(num == num_outer); - } - - SECTION("Output of retrieve and retrieve_outer should be the same.") - { - auto output_begin = result_begin; - auto output_end = map.retrieve(key_begin, key_begin + num_keys, output_begin); - auto size = thrust::distance(output_begin, output_end); - - output_end = map.retrieve_outer(key_begin, key_begin + 
num_keys, output_begin); - auto size_outer = thrust::distance(output_begin, output_end); - - REQUIRE(size == size_outer); - - // sort before compare - thrust::sort(thrust::device, - output_begin, - output_end, - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(thrust::equal( - thrust::device, - pair_begin, - pair_begin + num_items, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); - } -} - -TEMPLATE_TEST_CASE_SIG("Multiplicity equals two", - "", - ((typename Key, typename Value, probe_sequence Probe), Key, Value, Probe), - (int32_t, int32_t, probe_sequence::linear_probing), - (int32_t, int64_t, probe_sequence::linear_probing), - (int64_t, int64_t, probe_sequence::linear_probing), - (int32_t, int32_t, probe_sequence::double_hashing), - (int32_t, int64_t, probe_sequence::double_hashing), - (int64_t, int64_t, probe_sequence::double_hashing)) -{ - constexpr std::size_t num_items{4}; - - thrust::device_vector d_keys(num_items / 2); - thrust::device_vector> d_pairs(num_items); - - thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); - // multiplicity = 2 - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_items), - d_pairs.begin(), - [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; - }); - - thrust::device_vector> d_results(num_items); - - if constexpr (Probe == probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{5, -1, -1}; - test_multiplicity_two( - map, d_pairs.begin(), d_keys.begin(), d_results.begin(), num_items); - } - if constexpr (Probe == probe_sequence::double_hashing) { - cuco::static_multimap map{5, -1, -1}; - test_multiplicity_two( - map, d_pairs.begin(), d_keys.begin(), d_results.begin(), 
num_items); - } -} - -template -__inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t num_keys) -{ - map.insert(pair_begin, pair_begin + num_keys); - - auto res = map.get_size(); - REQUIRE(res == num_keys); - - SECTION("Output of count and retrieve should be coherent.") - { - auto num = map.count(key_begin, key_begin + num_keys); - thrust::device_vector> d_results(num); - - REQUIRE(num == num_keys); - - auto output_begin = d_results.data().get(); - auto output_end = map.retrieve(key_begin, key_begin + num_keys, output_begin); - auto size = thrust::distance(output_begin, output_end); - - REQUIRE(size == num_keys); - - // sort before compare - thrust::sort(thrust::device, - output_begin, - output_end, - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(thrust::equal( - thrust::device, - pair_begin, - pair_begin + num_keys, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); - } - - SECTION("Output of count_outer and retrieve_outer should be coherent.") - { - auto num = map.count_outer(key_begin, key_begin + num_keys); - thrust::device_vector> d_results(num); - - REQUIRE(num == (num_keys + num_keys / 2)); - - auto output_begin = d_results.data().get(); - auto output_end = map.retrieve_outer(key_begin, key_begin + num_keys, output_begin); - auto size = thrust::distance(output_begin, output_end); - - REQUIRE(size == (num_keys + num_keys / 2)); - - // sort before compare - thrust::sort(thrust::device, - output_begin, - output_end, - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - // create gold reference - thrust::device_vector> gold(size); - auto gold_begin = gold.begin(); 
- thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(size), - gold_begin, - [num_keys] __device__(auto i) { - if (i < num_keys) { return cuco::pair_type{i / 2, i}; } - return cuco::pair_type{i - num_keys / 2, -1}; - }); - - REQUIRE(thrust::equal( - thrust::device, - gold_begin, - gold_begin + size, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); - } -} - -TEMPLATE_TEST_CASE_SIG("Tests of non-matches", - "", - ((typename Key, typename Value, probe_sequence Probe), Key, Value, Probe), - (int32_t, int32_t, probe_sequence::linear_probing), - (int32_t, int64_t, probe_sequence::linear_probing), - (int64_t, int64_t, probe_sequence::linear_probing), - (int32_t, int32_t, probe_sequence::double_hashing), - (int32_t, int64_t, probe_sequence::double_hashing), - (int64_t, int64_t, probe_sequence::double_hashing)) -{ - constexpr std::size_t num_keys{1'000'000}; - - thrust::device_vector d_keys(num_keys); - thrust::device_vector> d_pairs(num_keys); - - thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); - // multiplicity = 2 - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_keys), - d_pairs.begin(), - [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; - }); - - if constexpr (Probe == probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{num_keys * 2, -1, -1}; - test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); - } - if constexpr (Probe == probe_sequence::double_hashing) { - cuco::static_multimap map{num_keys * 2, -1, -1}; - test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); - } -} - -template -__inline__ void test_insert_if(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t size) -{ - // 50% insertion - auto pred_lambda = [] __device__(Key k) { return k % 2 == 0; }; - 
- map.insert_if(pair_begin, pair_begin + size, key_begin, pred_lambda); - - auto res = map.get_size(); - REQUIRE(res * 2 == size); - - auto num = map.count(key_begin, key_begin + size); - REQUIRE(num * 2 == size); -} - -TEMPLATE_TEST_CASE_SIG("Tests of insert_if", - "", - ((typename Key, typename Value, probe_sequence Probe), Key, Value, Probe), - (int32_t, int32_t, probe_sequence::linear_probing), - (int32_t, int64_t, probe_sequence::linear_probing), - (int64_t, int64_t, probe_sequence::linear_probing), - (int32_t, int32_t, probe_sequence::double_hashing), - (int32_t, int64_t, probe_sequence::double_hashing), - (int64_t, int64_t, probe_sequence::double_hashing)) -{ - constexpr std::size_t num_keys{1'000}; - - thrust::device_vector d_keys(num_keys); - thrust::device_vector> d_pairs(num_keys); - - thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); - // multiplicity = 1 - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_keys), - d_pairs.begin(), - [] __device__(auto i) { - return cuco::pair_type{i, i}; - }); - - if constexpr (Probe == probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{num_keys * 2, -1, -1}; - test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); - } - if constexpr (Probe == probe_sequence::double_hashing) { - cuco::static_multimap map{num_keys * 2, -1, -1}; - test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); - } -} - -// Custom pair equal -template -struct pair_equal { - __device__ bool operator()(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const - { - return lhs.first == rhs.first; - } -}; - -template -__inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num_pairs) -{ - map.insert(pair_begin, pair_begin + num_pairs); - cudaStreamSynchronize(0); - - auto res = map.get_size(); - REQUIRE(res == num_pairs); - - // query pair matching rate = 50% - 
thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_pairs), - pair_begin, - [] __device__(auto i) { - return cuco::pair_type{i, i}; - }); - - SECTION("Output of pair_count and pair_retrieve should be coherent.") - { - auto num = map.pair_count(pair_begin, pair_begin + num_pairs, pair_equal{}); - - auto out1_begin = thrust::make_zip_iterator( - thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); - auto out2_begin = thrust::make_zip_iterator( - thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); - - REQUIRE(num == num_pairs); - - auto [out1_end, out2_end] = map.pair_retrieve( - pair_begin, pair_begin + num_pairs, out1_begin, out2_begin, pair_equal{}); - - REQUIRE((out1_end - out1_begin) == num_pairs); - } - - SECTION("Output of pair_count_outer and pair_retrieve_outer should be coherent.") - { - auto num = map.pair_count_outer(pair_begin, pair_begin + num_pairs, pair_equal{}); - - auto out1_begin = thrust::make_zip_iterator( - thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); - auto out2_begin = thrust::make_zip_iterator( - thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); - - REQUIRE(num == (num_pairs + num_pairs / 2)); - - auto [out1_end, out2_end] = map.pair_retrieve_outer( - pair_begin, pair_begin + num_pairs, out1_begin, out2_begin, pair_equal{}); - - REQUIRE((out1_end - out1_begin) == (num_pairs + num_pairs / 2)); - } -} - -TEMPLATE_TEST_CASE_SIG("Tests of pair functions", - "", - ((typename Key, typename Value, probe_sequence Probe), Key, Value, Probe), - (int32_t, int32_t, probe_sequence::linear_probing), - (int32_t, int64_t, probe_sequence::linear_probing), - (int64_t, int64_t, probe_sequence::linear_probing), - (int32_t, int32_t, probe_sequence::double_hashing), - (int32_t, int64_t, probe_sequence::double_hashing), - (int64_t, int64_t, probe_sequence::double_hashing)) -{ 
- constexpr std::size_t num_pairs{4}; - thrust::device_vector> d_pairs(num_pairs); - - // pair multiplicity = 2 - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_pairs), - d_pairs.begin(), - [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; - }); - - if constexpr (Probe == probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{num_pairs * 2, -1, -1}; - test_pair_functions(map, d_pairs.begin(), num_pairs); - } - if constexpr (Probe == probe_sequence::double_hashing) { - cuco::static_multimap map{num_pairs * 2, -1, -1}; - test_pair_functions(map, d_pairs.begin(), num_pairs); - } -} diff --git a/tests/util.hpp b/tests/util.hpp index 44d4b8bc3..b1e2ff75d 100644 --- a/tests/util.hpp +++ b/tests/util.hpp @@ -20,6 +20,8 @@ namespace cg = cooperative_groups; +enum class probe_sequence { linear_probing, double_hashing }; + // User-defined logical algorithms to reduce compilation time template bool all_of(Iterator begin, Iterator end, Predicate p) From d84dcee2973deff64fe0059d6f00569eb8c35c1e Mon Sep 17 00:00:00 2001 From: Chirayu Date: Mon, 8 Nov 2021 08:48:56 -0800 Subject: [PATCH 24/70] Add unit test, address review comments --- examples/static_map/static_map_example.cu | 6 +++++- include/cuco/detail/static_map.inl | 22 ++++++++++------------ include/cuco/static_map.cuh | 12 ++++++++---- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/examples/static_map/static_map_example.cu b/examples/static_map/static_map_example.cu index 8e1b5a4b4..42c8080de 100644 --- a/examples/static_map/static_map_example.cu +++ b/examples/static_map/static_map_example.cu @@ -30,7 +30,9 @@ int main(void) // Constructs a map with 100,000 slots using -1 and -1 as the empty key/value // sentinels. Note the capacity is chosen knowing we will insert 50,000 keys, // for an load factor of 50%. 
- cuco::static_map map{100'000, empty_key_sentinel, empty_value_sentinel}; + cudaStream_t str; + cudaStreamCreate(&str); + cuco::static_map map{100'000, empty_key_sentinel, empty_value_sentinel, cuco::cuda_allocator{}, str}; thrust::device_vector> pairs(50'000); @@ -42,6 +44,7 @@ int main(void) // Inserts all pairs into the map map.insert(pairs.begin(), pairs.end()); + cudaStreamSynchronize(str); // Sequence of keys {0, 1, 2, ...} thrust::device_vector keys_to_find(50'000); @@ -51,6 +54,7 @@ int main(void) // Finds all keys {0, 1, 2, ...} and stores associated values into `found_values` // If a key `keys_to_find[i]` doesn't exist, `found_values[i] == empty_value_sentinel` map.find(keys_to_find.begin(), keys_to_find.end(), found_values.begin()); + cudaStreamSynchronize(str); return 0; } diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 0826ff30d..0821e31ee 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -32,10 +32,8 @@ static_map::static_map(std::size_t capacity, exec_stream_{stream} { // allocator should allocate memory accessible by the exec_stream_ - slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_, - exec_stream_); - num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1, - exec_stream_); + slots_ = slot_allocator_.allocate(capacity_, exec_stream_); + num_successes_ = counter_allocator_.allocate(1, exec_stream_); auto constexpr block_size = 256; auto constexpr stride = 4; @@ -49,8 +47,8 @@ template ::~static_map() { // use exec_stream_ parameter param - std::allocator_traits::deallocate(slot_allocator_, slots_, capacity_); - std::allocator_traits::deallocate(counter_allocator_, num_successes_, 1); + slot_allocator_.deallocate(slots_, capacity_, exec_stream_); + counter_allocator_.deallocate(num_successes_, 1, exec_stream_); } template @@ -82,8 +80,8 @@ void static_map::insert(InputIt first, &h_num_successes, num_successes_, 
sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); // stream'd execution assumes sync not required - if (stream == NULL) - CUCO_CUDA_TRY(cudaDeviceSynchronize()); // ensures legacy behavior + if (stream == 0) + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // ensures legacy behavior size_ += h_num_successes; } @@ -106,8 +104,8 @@ void static_map::find( detail::find <<>>(first, last, output_begin, view, hash, key_equal); // stream'd execution assumes sync not required - if (stream == NULL) - CUCO_CUDA_TRY(cudaDeviceSynchronize()); // ensures legacy behavior + if (stream == 0) + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // ensures legacy behavior } template @@ -128,8 +126,8 @@ void static_map::contains( detail::contains <<>>(first, last, output_begin, view, hash, key_equal); // stream'd execution assumes sync not required - if (stream == NULL) - CUCO_CUDA_TRY(cudaDeviceSynchronize()); // ensures legacy behavior + if (stream == 0) + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // ensures legacy behavior } template diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index a9bd1a207..3e61e797a 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -189,12 +189,13 @@ class static_map { * @param empty_key_sentinel The reserved key value for empty slots * @param empty_value_sentinel The reserved mapped value for empty slots * @param alloc Allocator used for allocating device storage + * @param stream Stream used for executing the kernels, blocking execution for NULL stream */ static_map(std::size_t capacity, Key empty_key_sentinel, Value empty_value_sentinel, Allocator const& alloc = Allocator{}, - cudaStream_t stream = NULL); + cudaStream_t stream = 0); /** * @brief Destroys the map and frees its contents. 
@@ -216,12 +217,13 @@ class static_map { * @param last End of the sequence of key/value pairs * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels, blocking execution for NULL stream */ template , typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = NULL); + cudaStream_t stream = 0); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. @@ -240,6 +242,7 @@ class static_map { * @param output_begin Beginning of the sequence of values retrieved for each key * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels, blocking execution for NULL stream */ template Date: Mon, 8 Nov 2021 18:28:03 -0500 Subject: [PATCH 25/70] Exclude unused headers + use make_transform_iterator instead of transform --- tests/dynamic_map/unique_sequence_test.cu | 19 +++---- tests/static_map/custom_type_test.cu | 20 ++----- tests/static_map/key_sentinel_test.cu | 3 -- tests/static_map/shared_memory_test.cu | 4 +- tests/static_map/unique_sequence_test.cu | 58 ++++++++++----------- tests/static_multimap/custom_type_test.cu | 3 -- tests/static_multimap/insert_if_test.cu | 3 -- tests/static_multimap/multiplicity_test.cu | 3 -- tests/static_multimap/non_match_test.cu | 3 -- tests/static_multimap/pair_function_test.cu | 3 -- 10 files changed, 38 insertions(+), 81 deletions(-) diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu index d166ab0a7..889a77859 100644 --- a/tests/dynamic_map/unique_sequence_test.cu +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -15,9 +15,8 @@ */ #include -#include - #include + #include #include @@ -35,17 +34,13 @@ 
TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); - thrust::device_vector> d_pairs(num_keys); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_keys), - d_pairs.begin(), - [] __device__(auto i) { - return cuco::pair_type{i, i}; - }); + + auto pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair_type(i, i); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); @@ -53,7 +48,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", // bulk function test cases SECTION("All inserted keys-value pairs should be correctly recovered during find") { - map.insert(d_pairs.begin(), d_pairs.end()); + map.insert(pairs_begin, pairs_begin + num_keys); map.find(d_keys.begin(), d_keys.end(), d_results.begin()); auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); @@ -72,7 +67,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", SECTION("All inserted keys-value pairs should be contained") { - map.insert(d_pairs.begin(), d_pairs.end()); + map.insert(pairs_begin, pairs_begin + num_keys); map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); REQUIRE( diff --git a/tests/static_map/custom_type_test.cu b/tests/static_map/custom_type_test.cu index 4d44cc82b..04898ce74 100644 --- a/tests/static_map/custom_type_test.cu +++ b/tests/static_map/custom_type_test.cu @@ -15,9 +15,6 @@ */ #include -#include -#include - #include #include @@ -112,20 +109,9 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", thrust::device_vector insert_keys(num); thrust::device_vector insert_values(num); - thrust::transform(thrust::device, - thrust::counting_iterator(0), - 
thrust::counting_iterator(num), - insert_keys.begin(), - [] __device__(auto i) { return Key{i}; }); - - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num), - insert_values.begin(), - [] __device__(auto i) { return Value{i}; }); - - auto insert_pairs = - thrust::make_zip_iterator(thrust::make_tuple(insert_keys.begin(), insert_values.begin())); + auto insert_pairs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair_type(i, i); }); SECTION("All inserted keys-value pairs should be correctly recovered during find") { diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index 40f130c22..55c862495 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -15,9 +15,6 @@ */ #include -#include -#include - #include #include diff --git a/tests/static_map/shared_memory_test.cu b/tests/static_map/shared_memory_test.cu index 52a86f862..eedf74863 100644 --- a/tests/static_map/shared_memory_test.cu +++ b/tests/static_map/shared_memory_test.cu @@ -17,10 +17,8 @@ #include #include -#include -#include - #include + #include #include diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index 624020d5a..1b2ce71af 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -15,10 +15,8 @@ */ #include -#include -#include - #include + #include #include @@ -39,17 +37,13 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); - thrust::device_vector> d_pairs(num_keys); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_keys), - d_pairs.begin(), - [] __device__(auto 
i) { - return cuco::pair_type{i, i}; - }); + + auto pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair_type(i, i); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); @@ -57,7 +51,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", // bulk function test cases SECTION("All inserted keys-value pairs should be correctly recovered during find") { - map.insert(d_pairs.begin(), d_pairs.end()); + map.insert(pairs_begin, pairs_begin + num_keys); map.find(d_keys.begin(), d_keys.end(), d_results.begin()); auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); @@ -68,7 +62,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", SECTION("All inserted keys-value pairs should be contained") { - map.insert(d_pairs.begin(), d_pairs.end()); + map.insert(pairs_begin, pairs_begin + num_keys); map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); REQUIRE( @@ -85,8 +79,8 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", SECTION("Inserting unique keys should return insert success.") { - REQUIRE(all_of(d_pairs.begin(), - d_pairs.end(), + REQUIRE(all_of(pairs_begin, + pairs_begin + num_keys, [m_view] __device__(cuco::pair_type const& pair) mutable { return m_view.insert(pair); })); @@ -96,18 +90,19 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", { SECTION("non-const view") { - REQUIRE(all_of(d_pairs.begin(), - d_pairs.end(), + REQUIRE(all_of(pairs_begin, + pairs_begin + num_keys, [view] __device__(cuco::pair_type const& pair) mutable { return view.find(pair.first) == view.end(); })); } SECTION("const view") { - REQUIRE(all_of( - d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { - return view.find(pair.first) == view.end(); - })); + REQUIRE(all_of(pairs_begin, + pairs_begin + num_keys, + [view] __device__(cuco::pair_type const& pair) { + return view.find(pair.first) == view.end(); + 
})); } } @@ -115,8 +110,8 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", { // Bulk insert keys thrust::for_each(thrust::device, - d_pairs.begin(), - d_pairs.end(), + pairs_begin, + pairs_begin + num_keys, [m_view] __device__(cuco::pair_type const& pair) mutable { m_view.insert(pair); }); @@ -124,8 +119,8 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", SECTION("non-const view") { // All keys should be found - REQUIRE(all_of(d_pairs.begin(), - d_pairs.end(), + REQUIRE(all_of(pairs_begin, + pairs_begin + num_keys, [view] __device__(cuco::pair_type const& pair) mutable { auto const found = view.find(pair.first); return (found != view.end()) and (found->first.load() == pair.first and @@ -135,12 +130,13 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", SECTION("const view") { // All keys should be found - REQUIRE(all_of( - d_pairs.begin(), d_pairs.end(), [view] __device__(cuco::pair_type const& pair) { - auto const found = view.find(pair.first); - return (found != view.end()) and - (found->first.load() == pair.first and found->second.load() == pair.second); - })); + REQUIRE(all_of(pairs_begin, + pairs_begin + num_keys, + [view] __device__(cuco::pair_type const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and (found->first.load() == pair.first and + found->second.load() == pair.second); + })); } } } diff --git a/tests/static_multimap/custom_type_test.cu b/tests/static_multimap/custom_type_test.cu index d54ac7f08..31c3c60aa 100644 --- a/tests/static_multimap/custom_type_test.cu +++ b/tests/static_multimap/custom_type_test.cu @@ -15,9 +15,6 @@ */ #include -#include -#include - #include #include diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index e8c8bff02..2a479c648 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -15,9 +15,6 @@ */ #include -#include -#include - #include #include diff --git 
a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index 0942b6d91..1efd266e5 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -15,9 +15,6 @@ */ #include -#include -#include - #include #include diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu index 1ff463bc3..a8eb7fa3c 100644 --- a/tests/static_multimap/non_match_test.cu +++ b/tests/static_multimap/non_match_test.cu @@ -15,9 +15,6 @@ */ #include -#include -#include - #include #include diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu index 39b454898..caaf1700a 100644 --- a/tests/static_multimap/pair_function_test.cu +++ b/tests/static_multimap/pair_function_test.cu @@ -15,10 +15,7 @@ */ #include -#include #include -#include - #include #include From 792e5546e2c52012b3d75c681bf0b59e97616e8b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 10 Nov 2021 16:44:00 -0800 Subject: [PATCH 26/70] Make contains const and make necessary device view members accessible. 
--- include/cuco/detail/static_map.inl | 6 +++--- include/cuco/static_map.cuh | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index da5d88db7..15d3aa17b 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -441,14 +441,14 @@ static_map::device_view::find(CG g, template template __device__ bool static_map::device_view::contains( - Key const& k, Hash hash, KeyEqual key_equal) noexcept + Key const& k, Hash hash, KeyEqual key_equal) const noexcept { auto current_slot = initial_slot(k, hash); while (true) { auto const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed); - if (detail::bitwise_compare(existing_key, empty_key_sentinel_)) { return false; } + if (detail::bitwise_compare(existing_key, this->empty_key_sentinel_)) { return false; } if (key_equal(existing_key, k)) { return true; } @@ -459,7 +459,7 @@ __device__ bool static_map::device_view::contains( template template __device__ bool static_map::device_view::contains( - CG g, Key const& k, Hash hash, KeyEqual key_equal) noexcept + CG g, Key const& k, Hash hash, KeyEqual key_equal) const noexcept { auto current_slot = initial_slot(g, k, hash); diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 582549de3..d8b349658 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -287,11 +287,11 @@ class static_map { using const_iterator = pair_atomic_type const*; using slot_type = slot_type; + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial Value of empty slot private: pair_atomic_type* slots_{}; ///< Pointer to flat slots storage std::size_t capacity_{}; ///< Total number of slots - Key empty_key_sentinel_{}; ///< Key value that represents an empty slot - Value empty_value_sentinel_{}; ///< Initial Value of empty slot protected: __host__ __device__ 
device_view_base(pair_atomic_type* slots, @@ -961,7 +961,7 @@ class static_map { typename KeyEqual = thrust::equal_to> __device__ bool contains(Key const& k, Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; + KeyEqual key_equal = KeyEqual{}) const noexcept; /** * @brief Indicates whether the key `k` was inserted into the map. @@ -989,7 +989,7 @@ class static_map { __device__ bool contains(CG g, Key const& k, Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) noexcept; + KeyEqual key_equal = KeyEqual{}) const noexcept; }; // class device_view /** From 2373d71842d727249408c1788b04695bf65ae839 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 10 Nov 2021 18:23:38 -0800 Subject: [PATCH 27/70] Add tests for device-side contains. --- include/cuco/static_map.cuh | 2 -- tests/static_map/static_map_test.cu | 12 ++++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index d8b349658..d45a971ab 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -289,11 +289,9 @@ class static_map { Key empty_key_sentinel_{}; ///< Key value that represents an empty slot Value empty_value_sentinel_{}; ///< Initial Value of empty slot - private: pair_atomic_type* slots_{}; ///< Pointer to flat slots storage std::size_t capacity_{}; ///< Total number of slots - protected: __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, Key empty_key_sentinel, diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 779305996..6a1de1012 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -190,6 +190,18 @@ TEST_CASE("User defined key and value type", "") none_of(contained.begin(), contained.end(), [] __device__(bool const& b) { return b; })); } + SECTION("All inserted keys-value pairs should be contained") + { + thrust::device_vector contained(num_pairs); + 
map.insert(insert_pairs, insert_pairs + num_pairs, hash_key_pair{}, key_pair_equals{}); + auto view = map.get_device_view(); + REQUIRE(all_of(insert_pairs, + insert_pairs + num_pairs, + [view] __device__(cuco::pair_type const& pair) { + return view.contains(pair.first, hash_key_pair{}, key_pair_equals{}); + })); + } + SECTION("Inserting unique keys should return insert success.") { auto m_view = map.get_device_mutable_view(); From 69f2ce6e0a46de47d3043375afb89ad671676519 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 11 Nov 2021 13:32:52 -0800 Subject: [PATCH 28/70] Add static_map::insert_if. --- include/cuco/detail/static_map.inl | 33 +++++++++++++ include/cuco/detail/static_map_kernels.cuh | 54 ++++++++++++++++++++++ include/cuco/static_map.cuh | 29 ++++++++++++ 3 files changed, 116 insertions(+) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 15d3aa17b..8c6bb12e0 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -76,6 +76,39 @@ void static_map::insert(InputIt first, size_ += h_num_successes; } +template +template +void static_map::insert_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + Hash hash, + KeyEqual key_equal) +{ + auto num_keys = std::distance(first, last); + if (num_keys == 0) { return; } + + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_mutable_view(); + + // TODO: memset an atomic variable is unsafe + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); + CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type))); + std::size_t h_num_successes; + + // TODO: Should I specialize the version with a tile size? 
+ detail::insert_if + <<>>(first, first + num_keys, num_successes_, view, stencil, pred, hash, key_equal); + CUCO_CUDA_TRY(cudaMemcpyAsync( + &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + size_ += h_num_successes; +} + template template void static_map::find( diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index e166de3c6..12e855f08 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -156,6 +156,60 @@ __global__ void insert( if (threadIdx.x == 0) { *num_successes += block_num_successes; } } +/** + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * If multiple keys in `[first, last)` compare equal, it is unspecified which + * element is inserted. + * + * @tparam block_size + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param num_successes The number of successfully inserted key/value pairs + * @param view Mutable device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function used to compare two keys for equality + */ +template +__global__ void insert_if( + InputIt first, InputIt last, atomicT* num_successes, viewT view, StencilIt stencil, Predicate pred, Hash hash, KeyEqual key_equal) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t thread_num_successes = 0; + + auto tid = block_size * blockIdx.x + threadIdx.x; 
+ auto it = first + tid; + auto i = tid; + + while (it < last) { + if (pred(*(stencil + i))) { + typename viewT::value_type const insert_pair{*it}; + if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } + it += gridDim.x * block_size; + } + ++i; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { *num_successes += block_num_successes; } +} + /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. * diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index d45a971ab..cbbdfa3aa 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -221,6 +221,35 @@ class static_map { typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + /** + * @brief Inserts key/value pairs in the range `[first, last)` if `pred` + * of the corresponding stencil returns true. + * + * The key/value pair `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from `std::iterator_traits::value_type`. 
+ * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for insert + */ + template , + typename KeyEqual = thrust::equal_to> + void insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. * From e348ada2960840185aeac529ba47cea9eb1cdae9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 11 Nov 2021 15:12:09 -0800 Subject: [PATCH 29/70] Add test of conditional insertion and fix bug. --- include/cuco/detail/static_map_kernels.cuh | 4 ++-- tests/static_map/static_map_test.cu | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 12e855f08..dc80cb4c1 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -199,9 +199,9 @@ __global__ void insert_if( if (pred(*(stencil + i))) { typename viewT::value_type const insert_pair{*it}; if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } - it += gridDim.x * block_size; } - ++i; + it += gridDim.x * block_size; + i += gridDim.x * block_size; } // compute number of successfully inserted elements for each block diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 6a1de1012..db3bc4b3e 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -178,6 +178,21 @@ TEST_CASE("User defined key and value type", "") REQUIRE(all_of(contained.begin(), contained.end(), 
[] __device__(bool const& b) { return b; })); } + SECTION("All conditionally inserted keys-value pairs should be contained") + { + thrust::device_vector contained(num_pairs); + map.insert_if(insert_pairs, insert_pairs + num_pairs, thrust::counting_iterator(0), + [] __device__(auto const& key) { return (key % 2) == 0; }, hash_key_pair{}, key_pair_equals{}); + map.contains(insert_keys.begin(), + insert_keys.end(), + contained.begin(), + hash_key_pair{}, + key_pair_equals{}); + + REQUIRE(thrust::equal(thrust::device, contained.begin(), contained.end(), thrust::counting_iterator(0), + [] __device__(auto const& idx_contained, auto const& idx) { return ((idx % 2) == 0) == idx_contained; })); + } + SECTION("Non-inserted keys-value pairs should not be contained") { thrust::device_vector contained(num_pairs); From f62ab46235867515f8e30a61e156d80c3bd589bd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 12 Nov 2021 11:19:11 -0500 Subject: [PATCH 30/70] Remove unnecessary pragma due to the use of new libcudacxx --- tests/static_map/static_map_test.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 46155d3d9..3dcd9b5c5 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -24,10 +24,6 @@ #include -// Disable warning for `__shared__ cuda::atomic`: "dynamic initialization is not supported -// for a function-scope static __shared__ variable within a __device__/__global__ function" -#pragma diag_suppress static_var_with_dynamic_init - namespace { namespace cg = cooperative_groups; From 427c55359db3eeddf343d0243a7daf59e1e01360 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 12 Nov 2021 11:21:24 -0500 Subject: [PATCH 31/70] Renaming: custom type example instead of custom type --- examples/CMakeLists.txt | 4 ++-- .../static_map/{custom_type.cu => custom_type_example.cu} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename 
examples/static_map/{custom_type.cu => custom_type_example.cu} (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 3788c764c..4b87d7150 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -18,12 +18,12 @@ endfunction(ConfigureExample) ################################################################################################### ConfigureExample(STATIC_MAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/static_map_example.cu") -ConfigureExample(CUSTOM_TYPE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type.cu") +ConfigureExample(CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") ConfigureExample(STATIC_MULTIMAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/static_multimap_example.cu") foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) if("${arch}" MATCHES "60") - target_compile_definitions(CUSTOM_TYPE PRIVATE CUCO_NO_INDEPENDENT_THREADS) + target_compile_definitions(CUSTOM_TYPE_EXAMPLE PRIVATE CUCO_NO_INDEPENDENT_THREADS) break() endif() endforeach() diff --git a/examples/static_map/custom_type.cu b/examples/static_map/custom_type_example.cu similarity index 100% rename from examples/static_map/custom_type.cu rename to examples/static_map/custom_type_example.cu From 2142419f0c8a8122b21075d8ccec6466ea127c10 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 12 Nov 2021 15:18:40 -0500 Subject: [PATCH 32/70] Resolve conflicts --- tests/static_map/static_map_test.cu | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index b681be165..11093b41f 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -237,14 +237,13 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", SECTION("All inserted keys-value pairs should be contained") { - thrust::device_vector contained(num_pairs); - map.insert(insert_pairs, insert_pairs + num_pairs, 
hash_key_pair{}, key_pair_equals{}); + thrust::device_vector contained(num); + map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{}); auto view = map.get_device_view(); - REQUIRE(all_of(insert_pairs, - insert_pairs + num_pairs, - [view] __device__(cuco::pair_type const& pair) { - return view.contains(pair.first, hash_key_pair{}, key_pair_equals{}); - })); + REQUIRE(all_of( + insert_pairs, insert_pairs + num, [view] __device__(cuco::pair_type const& pair) { + return view.contains(pair.first, hash_custom_key{}, custom_key_equals{}); + })); } SECTION("Inserting unique keys should return insert success.") From 720d3bf0767d99532820f0053bd7ab127977eab2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 15 Nov 2021 18:37:36 -0500 Subject: [PATCH 33/70] Allocate/deallocate take no stream argument --- include/cuco/allocator.hpp | 7 ++----- include/cuco/detail/static_multimap/static_multimap.inl | 4 ++-- include/cuco/static_multimap.cuh | 4 ++-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/include/cuco/allocator.hpp b/include/cuco/allocator.hpp index 634e90423..71d590384 100644 --- a/include/cuco/allocator.hpp +++ b/include/cuco/allocator.hpp @@ -32,17 +32,14 @@ class cuda_allocator { { } - value_type* allocate(std::size_t n, cudaStream_t stream = 0) + value_type* allocate(std::size_t n) { value_type* p; CUCO_CUDA_TRY(cudaMalloc(&p, sizeof(value_type) * n)); return p; } - void deallocate(value_type* p, std::size_t, cudaStream_t stream = 0) - { - CUCO_CUDA_TRY(cudaFree(p)); - } + void deallocate(value_type* p, std::size_t) { CUCO_CUDA_TRY(cudaFree(p)); } }; template diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 6d7c36f00..1bdfd091b 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -54,8 +54,8 @@ static_multimap::static_multimap( stream_{stream}, 
delete_counter_{counter_allocator_, stream_}, delete_slots_{slot_allocator_, capacity_, stream_}, - d_counter_{counter_allocator_.allocate(1, stream_), delete_counter_}, - slots_{slot_allocator_.allocate(capacity_, stream_), delete_slots_} + d_counter_{counter_allocator_.allocate(1), delete_counter_}, + slots_{slot_allocator_.allocate(capacity_), delete_slots_} { auto constexpr block_size = 128; auto constexpr stride = 4; diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 9261ef9a2..b4a5f7626 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -536,7 +536,7 @@ class static_multimap { counter_deleter(counter_deleter const&) = default; - void operator()(atomic_ctr_type* ptr) { allocator.deallocate(ptr, 1, stream); } + void operator()(atomic_ctr_type* ptr) { allocator.deallocate(ptr, 1); } counter_allocator_type& allocator; cudaStream_t& stream; @@ -553,7 +553,7 @@ class static_multimap { slot_deleter(slot_deleter const&) = default; - void operator()(pair_atomic_type* ptr) { allocator.deallocate(ptr, capacity, stream); } + void operator()(pair_atomic_type* ptr) { allocator.deallocate(ptr, capacity); } slot_allocator_type& allocator; size_t& capacity; From 719ffc5521857a2f04191db2bbb739826f15a4ba Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 15 Nov 2021 18:58:14 -0500 Subject: [PATCH 34/70] Remove stream member variable --- .../cuco/detail/static_multimap/static_multimap.inl | 7 +++---- include/cuco/static_multimap.cuh | 10 ++-------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 1bdfd091b..0e8123f53 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -51,9 +51,8 @@ static_multimap::static_multimap( empty_value_sentinel_{empty_value_sentinel}, counter_allocator_{alloc}, 
slot_allocator_{alloc}, - stream_{stream}, - delete_counter_{counter_allocator_, stream_}, - delete_slots_{slot_allocator_, capacity_, stream_}, + delete_counter_{counter_allocator_}, + delete_slots_{slot_allocator_, capacity_}, d_counter_{counter_allocator_.allocate(1), delete_counter_}, slots_{slot_allocator_.allocate(capacity_), delete_slots_} { @@ -61,7 +60,7 @@ static_multimap::static_multimap( auto constexpr stride = 4; auto const grid_size = (get_capacity() + stride * block_size - 1) / (stride * block_size); - detail::initialize<<>>( + detail::initialize<<>>( slots_.get(), empty_key_sentinel, empty_value_sentinel, get_capacity()); } diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index b4a5f7626..16b3ecd21 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -532,24 +532,20 @@ class static_multimap { * @brief Custom deleter for unique pointer of device counter. */ struct counter_deleter { - counter_deleter(counter_allocator_type& a, cudaStream_t& s) : allocator{a}, stream{s} {} + counter_deleter(counter_allocator_type& a) : allocator{a} {} counter_deleter(counter_deleter const&) = default; void operator()(atomic_ctr_type* ptr) { allocator.deallocate(ptr, 1); } counter_allocator_type& allocator; - cudaStream_t& stream; }; /** * @brief Custom deleter for unique pointer of slots. 
*/ struct slot_deleter { - slot_deleter(slot_allocator_type& a, size_t& c, cudaStream_t& s) - : allocator{a}, capacity{c}, stream{s} - { - } + slot_deleter(slot_allocator_type& a, size_t& c) : allocator{a}, capacity{c} {} slot_deleter(slot_deleter const&) = default; @@ -557,7 +553,6 @@ class static_multimap { slot_allocator_type& allocator; size_t& capacity; - cudaStream_t& stream; }; class device_view_impl_base; @@ -1146,7 +1141,6 @@ class static_multimap { Value empty_value_sentinel_{}; ///< Initial value of empty slot slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate counters - cudaStream_t stream_{}; ///< CUDA stream used for ctor/dtor counter_deleter delete_counter_; ///< Custom counter deleter slot_deleter delete_slots_; ///< Custom slots deleter std::unique_ptr d_counter_{}; ///< Preallocated device counter From 8891da5475f95fd3f53fd7dc75bc1bfba4372ebb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 16 Nov 2021 16:40:33 -0800 Subject: [PATCH 35/70] Change insert to insert_if_n. --- include/cuco/detail/static_map.inl | 4 ++-- include/cuco/detail/static_map_kernels.cuh | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 8c6bb12e0..e1c2d0a21 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -100,8 +100,8 @@ void static_map::insert_if(InputIt first, std::size_t h_num_successes; // TODO: Should I specialize the version with a tile size? 
- detail::insert_if - <<>>(first, first + num_keys, num_successes_, view, stencil, pred, hash, key_equal); + detail::insert_if_n + <<>>(first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); CUCO_CUDA_TRY(cudaDeviceSynchronize()); diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index dc80cb4c1..018dba4d9 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -184,23 +184,21 @@ template -__global__ void insert_if( - InputIt first, InputIt last, atomicT* num_successes, viewT view, StencilIt stencil, Predicate pred, Hash hash, KeyEqual key_equal) +__global__ void insert_if_n( + InputIt first, std::size_t n, atomicT* num_successes, viewT view, StencilIt stencil, Predicate pred, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid; auto i = tid; - while (it < last) { + while (i < n) { if (pred(*(stencil + i))) { - typename viewT::value_type const insert_pair{*it}; + typename viewT::value_type const insert_pair{*(first + i)}; if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } } - it += gridDim.x * block_size; i += gridDim.x * block_size; } From c31ee58983033567162075846c17dafd1ac4eab8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 16 Nov 2021 17:14:59 -0800 Subject: [PATCH 36/70] Address most PR comments. 
--- include/cuco/detail/static_map.inl | 17 +++++++++-------- include/cuco/static_map.cuh | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index e1c2d0a21..c632c1052 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -83,28 +83,29 @@ void static_map::insert_if(InputIt first, StencilIt stencil, Predicate pred, Hash hash, - KeyEqual key_equal) + KeyEqual key_equal, + cudaStream_t stream) { auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); auto view = get_device_mutable_view(); // TODO: memset an atomic variable is unsafe static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type))); + CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); std::size_t h_num_successes; // TODO: Should I specialize the version with a tile size? 
detail::insert_if_n - <<>>(first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); + <<>>(first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); + &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); size_ += h_num_successes; } diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index cbbdfa3aa..2c79d4868 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -248,7 +248,7 @@ class static_map { typename Hash = cuco::detail::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + InputIt first, InputIt last, StencilIt stencil, Predicate pred, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = 0); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. From 2bb6c017dae3cdad9ae1fbe1d9dd4eff42e90118 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 17 Nov 2021 09:30:59 -0800 Subject: [PATCH 37/70] Switch to using CG device API. --- include/cuco/detail/static_map.inl | 2 +- include/cuco/detail/static_map_kernels.cuh | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index c632c1052..7d844917c 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -101,7 +101,7 @@ void static_map::insert_if(InputIt first, std::size_t h_num_successes; // TODO: Should I specialize the version with a tile size? 
- detail::insert_if_n + detail::insert_if_n <<>>(first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 018dba4d9..d1202a243 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -177,6 +177,7 @@ __global__ void insert( * @param key_equal The binary function used to compare two keys for equality */ template (cg::this_thread_block()); auto tid = block_size * blockIdx.x + threadIdx.x; - auto i = tid; + auto i = tid / tile_size; while (i < n) { if (pred(*(stencil + i))) { typename viewT::value_type const insert_pair{*(first + i)}; - if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } + if (view.insert(tile, insert_pair, hash, key_equal)) { thread_num_successes++; } } i += gridDim.x * block_size; } From 5da364f9eec0b02d2a1a58981e09a33484e3b255 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 17 Nov 2021 12:05:42 -0800 Subject: [PATCH 38/70] Apply suggestions from code review Co-authored-by: Yunsong Wang --- include/cuco/detail/static_map_kernels.cuh | 20 +++++++++++++++----- include/cuco/static_map.cuh | 2 ++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index d1202a243..e02523b1c 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -157,22 +157,30 @@ __global__ void insert( } /** - * @brief Inserts all key/value pairs in the range `[first, last)`. + * @brief Inserts key/value pairs in the range `[first, first + n)` if `pred` of the + * corresponding stencil returns true. * * If multiple keys in `[first, last)` compare equal, it is unspecified which * element is inserted. 
* - * @tparam block_size + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform insert * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `value_type` * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs + * @param n Number of elements to insert * @param num_successes The number of successfully inserted key/value pairs * @param view Mutable device view used to access the hash map's slot storage + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[s, s + n)` * @param hash The unary function to apply to hash each key * @param key_equal The binary function used to compare two keys for equality */ @@ -201,13 +209,15 @@ __global__ void insert_if_n( typename viewT::value_type const insert_pair{*(first + i)}; if (view.insert(tile, insert_pair, hash, key_equal)) { thread_num_successes++; } } - i += gridDim.x * block_size; + i += (gridDim.x * block_size) / tile_size; } // compute number of successfully inserted elements for each block // and atomically add to the grand total std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } + if (threadIdx.x == 0) { + num_matches->fetch_add(block_num_matches, cuda::std::memory_order_relaxed); + } } /** 
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 2c79d4868..5c6367f44 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -240,6 +240,8 @@ class static_map { * @param stencil Beginning of the stencil sequence * @param pred Predicate to test on every element in the range `[stencil, stencil + * std::distance(first, last))` + * @param hash The unary function to hash each key + * @param key_equal The binary function to compare two keys for equality * @param stream CUDA stream used for insert */ template Date: Wed, 17 Nov 2021 12:08:53 -0800 Subject: [PATCH 39/70] Apply clang-format. --- include/cuco/detail/static_map.inl | 14 +++++++++----- include/cuco/detail/static_map_kernels.cuh | 18 ++++++++++++------ include/cuco/static_map.cuh | 9 +++++++-- tests/static_map/static_map_test.cu | 18 ++++++++++++++---- 4 files changed, 42 insertions(+), 17 deletions(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 7d844917c..c236973e1 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -77,7 +77,11 @@ void static_map::insert(InputIt first, } template -template +template void static_map::insert_if(InputIt first, InputIt last, StencilIt stencil, @@ -92,8 +96,8 @@ void static_map::insert_if(InputIt first, auto constexpr block_size = 128; auto constexpr stride = 1; auto constexpr tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_mutable_view(); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto view = get_device_mutable_view(); // TODO: memset an atomic variable is unsafe static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); @@ -101,8 +105,8 @@ void static_map::insert_if(InputIt first, std::size_t h_num_successes; // TODO: Should I specialize the version with a tile size? 
- detail::insert_if_n - <<>>(first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); + detail::insert_if_n<<>>( + first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index e02523b1c..6a5fa94dd 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -193,16 +193,22 @@ template -__global__ void insert_if_n( - InputIt first, std::size_t n, atomicT* num_successes, viewT view, StencilIt stencil, Predicate pred, Hash hash, KeyEqual key_equal) +__global__ void insert_if_n(InputIt first, + std::size_t n, + atomicT* num_successes, + viewT view, + StencilIt stencil, + Predicate pred, + Hash hash, + KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto i = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = block_size * blockIdx.x + threadIdx.x; + auto i = tid / tile_size; while (i < n) { if (pred(*(stencil + i))) { @@ -216,7 +222,7 @@ __global__ void insert_if_n( // and atomically add to the grand total std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); if (threadIdx.x == 0) { - num_matches->fetch_add(block_num_matches, cuda::std::memory_order_relaxed); + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); } } diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 5c6367f44..e7661347d 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -249,8 +249,13 @@ 
class static_map { typename Predicate, typename Hash = cuco::detail::MurmurHash3_32, typename KeyEqual = thrust::equal_to> - void insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = 0); + void insert_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = 0); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index db3bc4b3e..89e4b3ede 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -181,16 +181,26 @@ TEST_CASE("User defined key and value type", "") SECTION("All conditionally inserted keys-value pairs should be contained") { thrust::device_vector contained(num_pairs); - map.insert_if(insert_pairs, insert_pairs + num_pairs, thrust::counting_iterator(0), - [] __device__(auto const& key) { return (key % 2) == 0; }, hash_key_pair{}, key_pair_equals{}); + map.insert_if( + insert_pairs, + insert_pairs + num_pairs, + thrust::counting_iterator(0), + [] __device__(auto const& key) { return (key % 2) == 0; }, + hash_key_pair{}, + key_pair_equals{}); map.contains(insert_keys.begin(), insert_keys.end(), contained.begin(), hash_key_pair{}, key_pair_equals{}); - REQUIRE(thrust::equal(thrust::device, contained.begin(), contained.end(), thrust::counting_iterator(0), - [] __device__(auto const& idx_contained, auto const& idx) { return ((idx % 2) == 0) == idx_contained; })); + REQUIRE(thrust::equal(thrust::device, + contained.begin(), + contained.end(), + thrust::counting_iterator(0), + [] __device__(auto const& idx_contained, auto const& idx) { + return ((idx % 2) == 0) == idx_contained; + })); } SECTION("Non-inserted keys-value pairs should not be contained") From 8409283f1d7635811478abc0139faf96b6598d1e Mon Sep 17 
00:00:00 2001 From: Chirayu Date: Wed, 17 Nov 2021 12:11:26 -0800 Subject: [PATCH 40/70] Address review comments, actually commit unit test file --- examples/static_map/static_map_example.cu | 7 ++- include/cuco/detail/static_map.inl | 27 +++----- include/cuco/static_map.cuh | 15 ++--- tests/static_map/static_map_test.cu | 75 ++++++++++++++++++++++- 4 files changed, 94 insertions(+), 30 deletions(-) diff --git a/examples/static_map/static_map_example.cu b/examples/static_map/static_map_example.cu index 42c8080de..743dc021a 100644 --- a/examples/static_map/static_map_example.cu +++ b/examples/static_map/static_map_example.cu @@ -43,8 +43,8 @@ int main(void) [] __device__(auto i) { return thrust::make_pair(i, i); }); // Inserts all pairs into the map - map.insert(pairs.begin(), pairs.end()); - cudaStreamSynchronize(str); + map.insert(pairs.begin(), pairs.end(), cuco::detail::MurmurHash3_32{}, + thrust::equal_to{}, str); // Sequence of keys {0, 1, 2, ...} thrust::device_vector keys_to_find(50'000); @@ -53,7 +53,8 @@ int main(void) // Finds all keys {0, 1, 2, ...} and stores associated values into `found_values` // If a key `keys_to_find[i]` doesn't exist, `found_values[i] == empty_value_sentinel` - map.find(keys_to_find.begin(), keys_to_find.end(), found_values.begin()); + map.find(keys_to_find.begin(), keys_to_find.end(), found_values.begin(), + cuco::detail::MurmurHash3_32{}, thrust::equal_to{}, str); cudaStreamSynchronize(str); return 0; diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 0821e31ee..483a5b2d7 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -28,27 +28,24 @@ static_map::static_map(std::size_t capacity, empty_key_sentinel_{empty_key_sentinel}, empty_value_sentinel_{empty_value_sentinel}, slot_allocator_{alloc}, - counter_allocator_{alloc}, - exec_stream_{stream} + counter_allocator_{alloc} { - // allocator should allocate memory accessible by the exec_stream_ - slots_ = 
slot_allocator_.allocate(capacity_, exec_stream_); - num_successes_ = counter_allocator_.allocate(1, exec_stream_); + slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_); + num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); auto constexpr block_size = 256; auto constexpr stride = 4; auto const grid_size = (capacity_ + stride * block_size - 1) / (stride * block_size); detail::initialize - <<>>(slots_, empty_key_sentinel, empty_value_sentinel, + <<>>(slots_, empty_key_sentinel, empty_value_sentinel, capacity_); } template static_map::~static_map() { - // use exec_stream_ parameter param - slot_allocator_.deallocate(slots_, capacity_, exec_stream_); - counter_allocator_.deallocate(num_successes_, 1, exec_stream_); + std::allocator_traits::deallocate(slot_allocator_, slots_, capacity_); + std::allocator_traits::deallocate(counter_allocator_, num_successes_, 1); } template @@ -79,9 +76,8 @@ void static_map::insert(InputIt first, CUCO_CUDA_TRY(cudaMemcpyAsync( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); - // stream'd execution assumes sync not required - if (stream == 0) - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // ensures legacy behavior + + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated size_ += h_num_successes; } @@ -103,9 +99,7 @@ void static_map::find( detail::find <<>>(first, last, output_begin, view, hash, key_equal); - // stream'd execution assumes sync not required - if (stream == 0) - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // ensures legacy behavior + } template @@ -125,9 +119,6 @@ void static_map::contains( detail::contains <<>>(first, last, output_begin, view, hash, key_equal); - // stream'd execution assumes sync not required - if (stream == 0) - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // ensures legacy behavior } template diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 
3e61e797a..18eb23fed 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -204,7 +204,7 @@ class static_map { ~static_map(); /** - * @brief Inserts all key/value pairs in the range `[first, last)`. + * @brief Inserts all key/value pairs in the range `[first, last)`. This function is synchronous. * * If multiple keys in `[first, last)` compare equal, it is unspecified which * element is inserted. @@ -217,7 +217,7 @@ class static_map { * @param last End of the sequence of key/value pairs * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality - * @param stream Stream used for executing the kernels, blocking execution for NULL stream + * @param stream Stream used for executing the kernels */ template , @@ -226,7 +226,8 @@ class static_map { cudaStream_t stream = 0); /** - * @brief Finds the values corresponding to all keys in the range `[first, last)`. + * @brief Asynchronous function, finds the values corresponding to all keys + * in the range `[first, last)`. * * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + * i)`. Else, copies the empty value sentinel. 
@@ -242,7 +243,7 @@ class static_map { * @param output_begin Beginning of the sequence of values retrieved for each key * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality - * @param stream Stream used for executing the kernels, blocking execution for NULL stream + * @param stream Stream used for executing the kernels */ template -bool all_of(Iterator begin, Iterator end, Predicate p) +bool all_of(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) { auto size = thrust::distance(begin, end); - return size == thrust::count_if(begin, end, p); + auto out = thrust::count_if(thrust::cuda::par.on(stream), begin, end, p); + cudaStreamSynchronize(stream); + return size == out; } template @@ -419,6 +421,75 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", } } +TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", + "", + ((typename T, dist_type Dist), T, Dist), + (int32_t, dist_type::UNIQUE), + (int64_t, dist_type::UNIFORM), + (int32_t, dist_type::GAUSSIAN)) +{ + using Key = T; + using Value = T; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + constexpr std::size_t num_keys{500'000}; + cuco::static_map map{1'000'000, -1, -1, + cuco::cuda_allocator{}, stream}; + + auto m_view = map.get_device_mutable_view(); + auto view = map.get_device_view(); + + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; + h_pairs[i].second = val; + h_values[i] = val; + } + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_results(num_keys); + thrust::device_vector d_contained(num_keys); + + auto hash_fn = cuco::detail::MurmurHash3_32{}; + auto equal_fn = 
thrust::equal_to{}; + + // bulk function test cases + SECTION("All inserted keys-value pairs should be correctly recovered during find") + { + map.insert(d_pairs.begin(), d_pairs.end(), hash_fn, equal_fn, stream); + map.find(d_keys.begin(), d_keys.end(), d_results.begin(), hash_fn, equal_fn, stream); + //cudaStreamSynchronize(stream); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); + + REQUIRE(all_of(zip, zip + num_keys, [] __device__(auto const& p) { + return thrust::get<0>(p) == thrust::get<1>(p); + }, stream)); + } + SECTION("All inserted keys-value pairs should be contained") + { + map.insert(d_pairs.begin(), d_pairs.end(), hash_fn, equal_fn, stream); + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin(), hash_fn, + equal_fn, stream); + + REQUIRE( + all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; }, + stream)); + } + + cudaStreamDestroy(stream); +} + template __global__ void shared_memory_test_kernel( typename MapType::device_view const* const device_views, From 1af02fa642181f29b181fee076305746d3cc311c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 17 Nov 2021 12:19:36 -0800 Subject: [PATCH 41/70] Remove unnecessary comment. --- include/cuco/detail/static_map.inl | 1 - 1 file changed, 1 deletion(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index c236973e1..aa52d69ee 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -104,7 +104,6 @@ void static_map::insert_if(InputIt first, CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); std::size_t h_num_successes; - // TODO: Should I specialize the version with a tile size? 
detail::insert_if_n<<>>( first, num_keys, num_successes_, view, stencil, pred, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( From ca68548dfbca341d94b23da98e578e2f402e4971 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 3 Dec 2021 10:23:06 -0500 Subject: [PATCH 42/70] Resolve conflicts --- examples/static_map/custom_type_example.cu | 4 ++-- tests/static_map/static_map_test.cu | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/static_map/custom_type_example.cu b/examples/static_map/custom_type_example.cu index 33b30e578..443323e36 100644 --- a/examples/static_map/custom_type_example.cu +++ b/examples/static_map/custom_type_example.cu @@ -83,7 +83,7 @@ int main(void) auto const empty_key_sentinel = custom_key_type{-1}; auto const empty_value_sentinel = custom_value_type{-1}; - // Create a sequence of 80'000 pairs + // Create an iterator of input key/value pairs auto pairs_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [] __device__(auto i) { return cuco::make_pair(custom_key_type{i}, custom_value_type{i}); }); @@ -93,7 +93,7 @@ int main(void) cuco::static_map map{ 100'000, empty_key_sentinel, empty_value_sentinel}; - // Inserts all pairs into the map by using the custom hasher and custom equality callable + // Inserts 80,000 pairs into the map by using the custom hasher and custom equality callable map.insert(pairs_begin, pairs_begin + num_pairs, custom_hash{}, custom_key_equals{}); // Reproduce inserted keys diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index c7879cef6..6f757ccb1 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -225,19 +225,19 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", SECTION("All conditionally inserted keys-value pairs should be contained") { - thrust::device_vector contained(num_pairs); + thrust::device_vector contained(num); map.insert_if( insert_pairs, - insert_pairs 
+ num_pairs, + insert_pairs + num, thrust::counting_iterator(0), [] __device__(auto const& key) { return (key % 2) == 0; }, - hash_key_pair{}, - key_pair_equals{}); + hash_custom_key{}, + custom_key_equals{}); map.contains(insert_keys.begin(), insert_keys.end(), contained.begin(), - hash_key_pair{}, - key_pair_equals{}); + hash_custom_key{}, + custom_key_equals{}); REQUIRE(thrust::equal(thrust::device, contained.begin(), From d4849f37e7ca9ce99ad6f064b6b5f71051736eac Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 3 Dec 2021 16:37:46 -0500 Subject: [PATCH 43/70] Minor CMake style fixes --- examples/CMakeLists.txt | 2 +- tests/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4b87d7150..937b8c8f4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -24,6 +24,6 @@ ConfigureExample(STATIC_MULTIMAP_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_mul foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) if("${arch}" MATCHES "60") target_compile_definitions(CUSTOM_TYPE_EXAMPLE PRIVATE CUCO_NO_INDEPENDENT_THREADS) - break() + break() endif() endforeach() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4f02b69de..aa41d6b48 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -43,7 +43,7 @@ ConfigureTest(STATIC_MAP_TEST "${STATIC_MAP_TEST_SRC}") foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) if("${arch}" MATCHES "60") target_compile_definitions(STATIC_MAP_TEST PRIVATE CUCO_NO_INDEPENDENT_THREADS) - break() + break() endif() endforeach() #################################################################################################### From 1e10b7d497d9f03de915577bba130f9b693fb247 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Mon, 6 Dec 2021 11:42:41 -0600 Subject: [PATCH 44/70] Doc updates. 
--- include/cuco/static_map.cuh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 3ca392a48..d46a6e139 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -189,7 +189,7 @@ class static_map { * @param empty_key_sentinel The reserved key value for empty slots * @param empty_value_sentinel The reserved mapped value for empty slots * @param alloc Allocator used for allocating device storage - * @param stream Stream used for executing the kernels, blocking execution for NULL stream + * @param stream Stream used for executing the kernels */ static_map(std::size_t capacity, Key empty_key_sentinel, @@ -204,7 +204,9 @@ class static_map { ~static_map(); /** - * @brief Inserts all key/value pairs in the range `[first, last)`. This function is synchronous. + * @brief Inserts all key/value pairs in the range `[first, last)`. + * + * This function synchronizes `stream`. * * If multiple keys in `[first, last)` compare equal, it is unspecified which * element is inserted. @@ -292,7 +294,7 @@ class static_map { cudaStream_t stream = 0); /** - * @brief Asynchronous function, indicates whether the keys in the range + * @brief Indicates whether the keys in the range * `[first, last)` are contained in the map. * * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
From c8f042f4a9fc4cdca6808a7d267c50a88bc729fa Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 6 Dec 2021 17:35:49 -0500 Subject: [PATCH 45/70] Add non-shared-memory pair retrieve APIs --- include/cuco/static_multimap.cuh | 57 ++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 16b3ecd21..d6044f549 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -959,6 +959,34 @@ class static_multimap { OutputIt output_begin, KeyEqual key_equal = KeyEqual{}) noexcept; + /** + * @brief Retrieves all the matches of a given pair contained in multimap without using shared + * memory buffer + * + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations + * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in + * `[contained_output_begin, contained_output_end)`. + * + * @tparam atomicT Type of atomic storage + * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from + * `InputIt`s `value_type`. + * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from + * the map's `value_type`. 
+ * @tparam PairEqual Binary callable type + * @param pair The pair to search for + * @param num_matches Size of the output sequence + * @param probe_output_begin Beginning of the output sequence of the matched probe pairs + * @param contained_output_begin Beginning of the output sequence of the matched contained + * pairs + * @param pair_equal The binary callable used to compare two pairs for equality + */ + template + __device__ __forceinline__ void pair_retrieve(value_type const& pair, + atomicT* num_matches, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + PairEqual pair_equal) noexcept; + /** * @brief Retrieves all the matches of a given pair contained in multimap with per-flushing-CG * shared memory buffer. @@ -1005,6 +1033,35 @@ class static_multimap { OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept; + /** + * @brief Retrieves all the matches of a given pair contained in multimap without using shared + * memory buffer + * + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations + * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in + * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies + * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output. + * + * @tparam atomicT Type of atomic storage + * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from + * `InputIt`s `value_type`. + * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from + * the map's `value_type`. 
+ * @tparam PairEqual Binary callable type + * @param pair The pair to search for + * @param num_matches Size of the output sequence + * @param probe_output_begin Beginning of the output sequence of the matched probe pairs + * @param contained_output_begin Beginning of the output sequence of the matched contained + * pairs + * @param pair_equal The binary callable used to compare two pairs for equality + */ + template + __device__ __forceinline__ void pair_retrieve_outer(value_type const& pair, + atomicT* num_matches, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + PairEqual pair_equal) noexcept; + /** * @brief Retrieves all the matches of a given pair contained in multimap with per-flushing-CG * shared memory buffer. From 147634e234edf5814932083db3952fe035a63d42 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 6 Dec 2021 17:41:15 -0500 Subject: [PATCH 46/70] Add probing CG argument in non-shared-memory retrieve APIs --- include/cuco/static_multimap.cuh | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index d6044f549..02afb9b1b 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -973,6 +973,7 @@ class static_multimap { * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from * the map's `value_type`. 
* @tparam PairEqual Binary callable type + * @param probing_cg The Cooperative Group used to retrieve * @param pair The pair to search for * @param num_matches Size of the output sequence * @param probe_output_begin Beginning of the output sequence of the matched probe pairs @@ -981,11 +982,13 @@ class static_multimap { * @param pair_equal The binary callable used to compare two pairs for equality */ template - __device__ __forceinline__ void pair_retrieve(value_type const& pair, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal) noexcept; + __device__ __forceinline__ void pair_retrieve( + cooperative_groups::thread_block_tile const& probing_cg, + value_type const& pair, + atomicT* num_matches, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + PairEqual pair_equal) noexcept; /** * @brief Retrieves all the matches of a given pair contained in multimap with per-flushing-CG @@ -1048,6 +1051,7 @@ class static_multimap { * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from * the map's `value_type`. 
* @tparam PairEqual Binary callable type + * @param probing_cg The Cooperative Group used to retrieve * @param pair The pair to search for * @param num_matches Size of the output sequence * @param probe_output_begin Beginning of the output sequence of the matched probe pairs @@ -1056,11 +1060,13 @@ class static_multimap { * @param pair_equal The binary callable used to compare two pairs for equality */ template - __device__ __forceinline__ void pair_retrieve_outer(value_type const& pair, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal) noexcept; + __device__ __forceinline__ void pair_retrieve_outer( + cooperative_groups::thread_block_tile const& probing_cg, + value_type const& pair, + atomicT* num_matches, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + PairEqual pair_equal) noexcept; /** * @brief Retrieves all the matches of a given pair contained in multimap with per-flushing-CG From 6c44a883b37d8f9dc12472a2c91c08f0ba95f9e3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 6 Dec 2021 18:47:27 -0500 Subject: [PATCH 47/70] Add non-shared-memory pair_retrieve kernel --- .../cuco/detail/static_multimap/kernels.cuh | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index 83aa2c640..6d82c0972 100644 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -412,6 +412,74 @@ __global__ void retrieve(InputIt first, } } +/** + * @brief Retrieves all pairs matching the input probe pair in the range `[first, last)`. + * + * If pair_equal(*(first + i), slot[j]) returns true, then *(first+i) is stored to unspecified + * locations in `probe_output_begin`, and slot[j] is stored to unspecified locations in + * `contained_output_begin`. 
If the given pair has no matches in the map, copies *(first + i) in + * `probe_output_begin` and a pair of `empty_key_sentinel` and `empty_value_sentinel` in + * `contained_output_begin` only when `is_outer` is `true`. + * + * Behavior is undefined if the total number of matching pairs exceeds `std::distance(output_begin, + * output_begin + *num_matches - 1)`. Use `pair_count()` to determine the size of the output range. + * + * @tparam block_size The size of the thread block + * @tparam probing_cg_size The size of the CG for parallel retrievals + * @tparam buffer_size Size of the output buffer + * @tparam is_outer Boolean flag indicating whether non-matches are included in the output + * @tparam InputIt Device accessible random access input iterator where + * `std::is_convertible::value_type, + * static_multimap::value_type>` is `true` + * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from + * `InputIt`s `value_type`. + * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from + * the map's `value_type`. 
+ * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam PairEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param probe_output_begin Beginning of the sequence of the matched probe pairs + * @param contained_output_begin Beginning of the sequence of the matched contained pairs + * @param num_matches Size of the output sequence + * @param view Device view used to access the hash map's slot storage + * @param pair_equal The binary function to compare two pairs for equality + */ +template +__global__ void pair_retrieve(InputIt first, + InputIt last, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + atomicT* num_matches, + viewT view, + PairEqual pair_equal) +{ + auto probing_cg = cg::tiled_partition(cg::this_thread_block()); + auto tid = block_size * blockIdx.x + threadIdx.x; + auto pair_idx = tid / probing_cg_size; + + while (first + pair_idx < last) { + pair_type pair = *(first + pair_idx); + if constexpr (is_outer) { + view.pair_retrieve_outer( + probing_cg, pair, num_matches, probe_output_begin, contained_output_begin, pair_equal); + } else { + view.pair_retrieve( + probing_cg, pair, num_matches, probe_output_begin, contained_output_begin, pair_equal); + } + pair_idx += (gridDim.x * block_size) / probing_cg_size; + } +} + /** * @brief Retrieves all pairs matching the input probe pair in the range `[first, last)`. 
* From 2d5d44bc2c0569a9627932034f3bea718299fe0f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 6 Dec 2021 20:08:54 -0500 Subject: [PATCH 48/70] Add non-shared-memory pair retrieve device impl functions --- .../static_multimap/device_view_impl.inl | 176 ++++++++++++++++++ 1 file changed, 176 insertions(+) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index c4e520446..eb9e4d624 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1055,6 +1055,182 @@ class static_multimap::device_view_ } // while running } + /** + * @brief Retrieves all the matches of a given pair contained in multimap using vector + * loads without shared memory buffer. + * + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations + * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in + * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies + * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only if + * `is_outer` is true. + * + * @tparam is_outer Boolean flag indicating whether outer join is peformed + * @tparam uses_vector_load Boolean flag indicating whether vector loads are used + * @tparam ProbingCG Type of Cooperative Group used to retrieve + * @tparam atomicT Type of atomic storage + * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from + * `InputIt`s `value_type`. + * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from + * the map's `value_type`. 
+ * @tparam PairEqual Binary callable type + * @param probing_cg The Cooperative Group used to retrieve + * @param pair The pair to search for + * @param num_matches Size of the output sequence + * @param probe_output_begin Beginning of the output sequence of the matched probe pairs + * @param contained_output_begin Beginning of the output sequence of the matched contained + * pairs + * @param pair_equal The binary callable used to compare two pairs for equality + */ + template + __device__ __forceinline__ std::enable_if_t pair_retrieve( + ProbingCG const& probing_cg, + value_type const& pair, + atomicT* num_matches, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + PairEqual pair_equal) noexcept + { + auto current_slot = initial_slot(probing_cg, pair.first); + [[maybe_unused]] auto found_match = false; + + while (true) { + value_type arr[2]; + load_pair_array(&arr[0], current_slot); + + auto const first_slot_is_empty = + detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel()); + auto const second_slot_is_empty = + detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel()); + auto const first_equals = (not first_slot_is_empty and pair_equal(arr[0], pair)); + auto const second_equals = (not second_slot_is_empty and pair_equal(arr[1], pair)); + + if constexpr (is_outer) { + auto const exists = probing_cg.any(first_equals or second_equals); + if (exists) { found_match = true; } + } + + using cuda::std::memory_order_relaxed; + + if (first_equals) { + auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + *(probe_output_begin + output_idx) = pair; + *(contained_output_begin + output_idx) = arr[0]; + } + if (second_equals) { + auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + *(probe_output_begin + output_idx) = pair; + *(contained_output_begin + output_idx) = arr[1]; + } + + if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { + if constexpr (is_outer) { + if ((not found_match) 
and probing_cg.thread_rank() == 0) { + auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + *(probe_output_begin + output_idx) = pair; + *(contained_output_begin + output_idx) = + cuco::make_pair(std::move(this->get_empty_key_sentinel()), + std::move(this->get_empty_value_sentinel())); + } + } + return; // exit if any slot in the window is empty + } + + current_slot = next_slot(current_slot); + } // while + } + + /** + * @brief Retrieves all the matches of a given pair contained in multimap using scalar + * loads without shared memory buffer. + * + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations + * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in + * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies + * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only if + * `is_outer` is true. + * + * @tparam is_outer Boolean flag indicating whether outer join is peformed + * @tparam uses_vector_load Boolean flag indicating whether vector loads are used + * @tparam ProbingCG Type of Cooperative Group used to retrieve + * @tparam atomicT Type of atomic storage + * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from + * `InputIt`s `value_type`. + * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from + * the map's `value_type`. 
+ * @tparam PairEqual Binary callable type + * @param probing_cg The Cooperative Group used to retrieve + * @param pair The pair to search for + * @param num_matches Size of the output sequence + * @param probe_output_begin Beginning of the output sequence of the matched probe pairs + * @param contained_output_begin Beginning of the output sequence of the matched contained + * pairs + * @param pair_equal The binary callable used to compare two pairs for equality + */ + template + __device__ __forceinline__ std::enable_if_t pair_retrieve( + ProbingCG const& probing_cg, + value_type const& pair, + atomicT* num_matches, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + PairEqual pair_equal) noexcept + { + auto current_slot = initial_slot(probing_cg, pair.first); + [[maybe_unused]] auto found_match = false; + + while (true) { + // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation + // is unsafe! + static_assert(sizeof(Key) == sizeof(cuda::atomic)); + static_assert(sizeof(Value) == sizeof(cuda::atomic)); + value_type slot_contents = *reinterpret_cast(current_slot); + + auto const slot_is_empty = + detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); + auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair)); + + if constexpr (is_outer) { + auto const exists = probing_cg.any(equals); + if (exists) { found_match = true; } + } + + using cuda::std::memory_order_relaxed; + + if (equals) { + auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + *(probe_output_begin + output_idx) = pair; + *(contained_output_begin + output_idx) = arr[0]; + } + + if (probing_cg.any(slot_is_empty)) { + if constexpr (is_outer) { + if ((not found_match) and probing_cg.thread_rank() == 0) { + auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + *(probe_output_begin + output_idx) = pair; + *(contained_output_begin + output_idx) = + 
cuco::make_pair(std::move(this->get_empty_key_sentinel()), + std::move(this->get_empty_value_sentinel())); + } + } + return; // exit if any slot in the window is empty + } + + current_slot = next_slot(current_slot); + } // while + } + /** * @brief Retrieves all the matches of a given pair contained in multimap using vector * loads with per-flushing-CG shared memory buffer. From 8946c11822c0ee7c97c1be8c9d4401ab07af4d4f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 6 Dec 2021 20:18:27 -0500 Subject: [PATCH 49/70] Fix typos --- include/cuco/detail/static_multimap/device_view_impl.inl | 5 +++-- include/cuco/detail/static_multimap/kernels.cuh | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index eb9e4d624..9b459bde0 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1083,7 +1083,8 @@ class static_multimap::device_view_ * @param pair_equal The binary callable used to compare two pairs for equality */ template ::device_view_ if (equals) { auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); *(probe_output_begin + output_idx) = pair; - *(contained_output_begin + output_idx) = arr[0]; + *(contained_output_begin + output_idx) = slot_contents; } if (probing_cg.any(slot_is_empty)) { diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index 6d82c0972..73a1dae0a 100644 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -463,6 +463,8 @@ __global__ void pair_retrieve(InputIt first, viewT view, PairEqual pair_equal) { + using pair_type = typename viewT::value_type; + auto probing_cg = cg::tiled_partition(cg::this_thread_block()); auto tid = block_size * blockIdx.x + threadIdx.x; auto pair_idx = tid / probing_cg_size; From 
071030a40ac189aacbfa0de97eac971b79b68262 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 6 Dec 2021 20:19:17 -0500 Subject: [PATCH 50/70] Add public device functions --- .../static_multimap/static_multimap.inl | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 0e8123f53..722fccdae 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -686,6 +686,26 @@ static_multimap::device_view::retri } } +template +template +__device__ __forceinline__ void +static_multimap::device_view::pair_retrieve( + cooperative_groups::thread_block_tile const& probing_cg, + value_type const& pair, + atomicT* num_matches, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + PairEqual pair_equal) noexcept +{ + constexpr bool is_outer = false; + impl_.pair_retrieve( + probing_cg, pair, num_matches, probe_output_begin, contained_output_begin, pair_equal); +} + template ::device_view::pair_ } } +template +template +__device__ __forceinline__ void +static_multimap::device_view::pair_retrieve_outer( + cooperative_groups::thread_block_tile const& probing_cg, + value_type const& pair, + atomicT* num_matches, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + PairEqual pair_equal) noexcept +{ + constexpr bool is_outer = true; + impl_.pair_retrieve( + probing_cg, pair, num_matches, probe_output_begin, contained_output_begin, pair_equal); +} + template Date: Mon, 6 Dec 2021 20:43:39 -0500 Subject: [PATCH 51/70] Use a workaround for output assignment --- .../static_multimap/device_view_impl.inl | 65 +++++++++++++------ 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 9b459bde0..ce66a053d 100644 --- 
a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1119,24 +1119,40 @@ class static_multimap::device_view_ using cuda::std::memory_order_relaxed; if (first_equals) { - auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); - *(probe_output_begin + output_idx) = pair; - *(contained_output_begin + output_idx) = arr[0]; + auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + // TODO: `=` operator cannot work here + // *(probe_output_begin + output_idx) = pair; + // *(contained_output_begin + output_idx) = arr[0]; + thrust::get<0>(*(probe_output_begin + output_idx)) = pair.first; + thrust::get<1>(*(probe_output_begin + output_idx)) = pair.second; + thrust::get<0>(*(contained_output_begin + output_idx)) = arr[0].first; + thrust::get<1>(*(contained_output_begin + output_idx)) = arr[0].second; } if (second_equals) { - auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); - *(probe_output_begin + output_idx) = pair; - *(contained_output_begin + output_idx) = arr[1]; + auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + // TODO: `=` operator cannot work here + // *(probe_output_begin + output_idx) = pair; + // *(contained_output_begin + output_idx) = arr[1]; + thrust::get<0>(*(probe_output_begin + output_idx)) = pair.first; + thrust::get<1>(*(probe_output_begin + output_idx)) = pair.second; + thrust::get<0>(*(contained_output_begin + output_idx)) = arr[1].first; + thrust::get<1>(*(contained_output_begin + output_idx)) = arr[1].second; } if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { if constexpr (is_outer) { if ((not found_match) and probing_cg.thread_rank() == 0) { - auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); - *(probe_output_begin + output_idx) = pair; - *(contained_output_begin + output_idx) = - cuco::make_pair(std::move(this->get_empty_key_sentinel()), - 
std::move(this->get_empty_value_sentinel())); + auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + // TODO: `=` operator cannot work here + // *(probe_output_begin + output_idx) = pair; + // *(contained_output_begin + output_idx) = + // cuco::make_pair(std::move(this->get_empty_key_sentinel()), + // std::move(this->get_empty_value_sentinel())); + thrust::get<0>(*(probe_output_begin + output_idx)) = pair.first; + thrust::get<1>(*(probe_output_begin + output_idx)) = pair.second; + thrust::get<0>(*(contained_output_begin + output_idx)) = this->get_empty_key_sentinel(); + thrust::get<1>(*(contained_output_begin + output_idx)) = + this->get_empty_value_sentinel(); } } return; // exit if any slot in the window is empty @@ -1210,19 +1226,30 @@ class static_multimap::device_view_ using cuda::std::memory_order_relaxed; if (equals) { - auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); - *(probe_output_begin + output_idx) = pair; - *(contained_output_begin + output_idx) = slot_contents; + auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + // TODO: `=` operator cannot work here + // *(probe_output_begin + output_idx) = pair; + // *(contained_output_begin + output_idx) = slot_contents; + thrust::get<0>(*(probe_output_begin + output_idx)) = pair.first; + thrust::get<1>(*(probe_output_begin + output_idx)) = pair.second; + thrust::get<0>(*(contained_output_begin + output_idx)) = slot_contents.first; + thrust::get<1>(*(contained_output_begin + output_idx)) = slot_contents.second; } if (probing_cg.any(slot_is_empty)) { if constexpr (is_outer) { if ((not found_match) and probing_cg.thread_rank() == 0) { - auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); - *(probe_output_begin + output_idx) = pair; - *(contained_output_begin + output_idx) = - cuco::make_pair(std::move(this->get_empty_key_sentinel()), - std::move(this->get_empty_value_sentinel())); + auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); 
+ // TODO: `=` operator cannot work here + // *(probe_output_begin + output_idx) = pair; + // *(contained_output_begin + output_idx) = + // cuco::make_pair(std::move(this->get_empty_key_sentinel()), + // std::move(this->get_empty_value_sentinel())); + thrust::get<0>(*(probe_output_begin + output_idx)) = pair.first; + thrust::get<1>(*(probe_output_begin + output_idx)) = pair.second; + thrust::get<0>(*(contained_output_begin + output_idx)) = this->get_empty_key_sentinel(); + thrust::get<1>(*(contained_output_begin + output_idx)) = + this->get_empty_value_sentinel(); } } return; // exit if any slot in the window is empty From 6f041d01a1dc0d860d3df8e93fe52ac15003fcf6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 7 Dec 2021 09:20:02 -0500 Subject: [PATCH 52/70] Update doc --- include/cuco/detail/static_multimap/kernels.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index 73a1dae0a..c737e0f05 100644 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -426,7 +426,6 @@ __global__ void retrieve(InputIt first, * * @tparam block_size The size of the thread block * @tparam probing_cg_size The size of the CG for parallel retrievals - * @tparam buffer_size Size of the output buffer * @tparam is_outer Boolean flag indicating whether non-matches are included in the output * @tparam InputIt Device accessible random access input iterator where * `std::is_convertible::value_type, From 777bfbc86b56ad710f73a78b7f129acf72c0b381 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 7 Dec 2021 10:07:54 -0500 Subject: [PATCH 53/70] Remove non-shmem pair retrieve kernel --- .../cuco/detail/static_multimap/kernels.cuh | 69 ------------------- 1 file changed, 69 deletions(-) diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index c737e0f05..83aa2c640 100644 --- 
a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -412,75 +412,6 @@ __global__ void retrieve(InputIt first, } } -/** - * @brief Retrieves all pairs matching the input probe pair in the range `[first, last)`. - * - * If pair_equal(*(first + i), slot[j]) returns true, then *(first+i) is stored to unspecified - * locations in `probe_output_begin`, and slot[j] is stored to unspecified locations in - * `contained_output_begin`. If the given pair has no matches in the map, copies *(first + i) in - * `probe_output_begin` and a pair of `empty_key_sentinel` and `empty_value_sentinel` in - * `contained_output_begin` only when `is_outer` is `true`. - * - * Behavior is undefined if the total number of matching pairs exceeds `std::distance(output_begin, - * output_begin + *num_matches - 1)`. Use `pair_count()` to determine the size of the output range. - * - * @tparam block_size The size of the thread block - * @tparam probing_cg_size The size of the CG for parallel retrievals - * @tparam is_outer Boolean flag indicating whether non-matches are included in the output - * @tparam InputIt Device accessible random access input iterator where - * `std::is_convertible::value_type, - * static_multimap::value_type>` is `true` - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. 
- * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam PairEqual Binary callable type - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param probe_output_begin Beginning of the sequence of the matched probe pairs - * @param contained_output_begin Beginning of the sequence of the matched contained pairs - * @param num_matches Size of the output sequence - * @param view Device view used to access the hash map's slot storage - * @param pair_equal The binary function to compare two pairs for equality - */ -template -__global__ void pair_retrieve(InputIt first, - InputIt last, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - atomicT* num_matches, - viewT view, - PairEqual pair_equal) -{ - using pair_type = typename viewT::value_type; - - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto pair_idx = tid / probing_cg_size; - - while (first + pair_idx < last) { - pair_type pair = *(first + pair_idx); - if constexpr (is_outer) { - view.pair_retrieve_outer( - probing_cg, pair, num_matches, probe_output_begin, contained_output_begin, pair_equal); - } else { - view.pair_retrieve( - probing_cg, pair, num_matches, probe_output_begin, contained_output_begin, pair_equal); - } - pair_idx += (gridDim.x * block_size) / probing_cg_size; - } -} - /** * @brief Retrieves all pairs matching the input probe pair in the range `[first, last)`. 
* From b787d856c54d8595ee906da7b229ee8862206c26 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 7 Dec 2021 12:02:30 -0500 Subject: [PATCH 54/70] Get rid of atomic counter argument for non-shmem pair retrieve --- .../static_multimap/device_view_impl.inl | 48 +++++++++++-------- .../static_multimap/static_multimap.inl | 10 ++-- include/cuco/static_multimap.cuh | 10 +--- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index ce66a053d..81eda41b3 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1068,7 +1068,6 @@ class static_multimap::device_view_ * @tparam is_outer Boolean flag indicating whether outer join is peformed * @tparam uses_vector_load Boolean flag indicating whether vector loads are used * @tparam ProbingCG Type of Cooperative Group used to retrieve - * @tparam atomicT Type of atomic storage * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from * `InputIt`s `value_type`. 
* @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from @@ -1076,7 +1075,6 @@ class static_multimap::device_view_ * @tparam PairEqual Binary callable type * @param probing_cg The Cooperative Group used to retrieve * @param pair The pair to search for - * @param num_matches Size of the output sequence * @param probe_output_begin Beginning of the output sequence of the matched probe pairs * @param contained_output_begin Beginning of the output sequence of the matched contained * pairs @@ -1085,21 +1083,28 @@ class static_multimap::device_view_ template __device__ __forceinline__ std::enable_if_t pair_retrieve( ProbingCG const& probing_cg, value_type const& pair, - atomicT* num_matches, OutputIt1 probe_output_begin, OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept { + using cuda::std::memory_order_relaxed; + + auto const lane_id = probing_cg.thread_rank(); auto current_slot = initial_slot(probing_cg, pair.first); [[maybe_unused]] auto found_match = false; + __shared__ atomic_ctr_type counter; + if (lane_id == 0) { counter.store(0, memory_order_relaxed); } + + // no need to sync if `is_outer` is `true` due to the upcoming `cg.any()` + if constexpr (not is_outer) { probing_cg.sync(); } + while (true) { value_type arr[2]; load_pair_array(&arr[0], current_slot); @@ -1116,10 +1121,8 @@ class static_multimap::device_view_ if (exists) { found_match = true; } } - using cuda::std::memory_order_relaxed; - if (first_equals) { - auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + auto output_idx = counter.fetch_add(1, memory_order_relaxed); // TODO: `=` operator cannot work here // *(probe_output_begin + output_idx) = pair; // *(contained_output_begin + output_idx) = arr[0]; @@ -1129,7 +1132,7 @@ class static_multimap::device_view_ thrust::get<1>(*(contained_output_begin + output_idx)) = arr[0].second; } if (second_equals) { - auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + auto 
output_idx = counter.fetch_add(1, memory_order_relaxed); // TODO: `=` operator cannot work here // *(probe_output_begin + output_idx) = pair; // *(contained_output_begin + output_idx) = arr[1]; @@ -1141,8 +1144,8 @@ class static_multimap::device_view_ if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { if constexpr (is_outer) { - if ((not found_match) and probing_cg.thread_rank() == 0) { - auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + if ((not found_match) and lane_id == 0) { + auto output_idx = counter.fetch_add(1, memory_order_relaxed); // TODO: `=` operator cannot work here // *(probe_output_begin + output_idx) = pair; // *(contained_output_begin + output_idx) = @@ -1155,7 +1158,7 @@ class static_multimap::device_view_ this->get_empty_value_sentinel(); } } - return; // exit if any slot in the window is empty + return; // exit if any slot in the current window is empty } current_slot = next_slot(current_slot); @@ -1175,7 +1178,6 @@ class static_multimap::device_view_ * @tparam is_outer Boolean flag indicating whether outer join is peformed * @tparam uses_vector_load Boolean flag indicating whether vector loads are used * @tparam ProbingCG Type of Cooperative Group used to retrieve - * @tparam atomicT Type of atomic storage * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from * `InputIt`s `value_type`. 
* @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from @@ -1183,7 +1185,6 @@ class static_multimap::device_view_ * @tparam PairEqual Binary callable type * @param probing_cg The Cooperative Group used to retrieve * @param pair The pair to search for - * @param num_matches Size of the output sequence * @param probe_output_begin Beginning of the output sequence of the matched probe pairs * @param contained_output_begin Beginning of the output sequence of the matched contained * pairs @@ -1192,21 +1193,28 @@ class static_multimap::device_view_ template __device__ __forceinline__ std::enable_if_t pair_retrieve( ProbingCG const& probing_cg, value_type const& pair, - atomicT* num_matches, OutputIt1 probe_output_begin, OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept { + using cuda::std::memory_order_relaxed; + + auto const lane_id = probing_cg.thread_rank(); auto current_slot = initial_slot(probing_cg, pair.first); [[maybe_unused]] auto found_match = false; + __shared__ atomic_ctr_type counter; + if (lane_id == 0) { counter.store(0, memory_order_relaxed); } + + // no need to sync if `is_outer` is `true` due to the upcoming `cg.any()` + if constexpr (not is_outer) { probing_cg.sync(); } + while (true) { // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation // is unsafe! 
@@ -1223,10 +1231,8 @@ class static_multimap::device_view_ if (exists) { found_match = true; } } - using cuda::std::memory_order_relaxed; - if (equals) { - auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + auto output_idx = counter.fetch_add(1, memory_order_relaxed); // TODO: `=` operator cannot work here // *(probe_output_begin + output_idx) = pair; // *(contained_output_begin + output_idx) = slot_contents; @@ -1238,8 +1244,8 @@ class static_multimap::device_view_ if (probing_cg.any(slot_is_empty)) { if constexpr (is_outer) { - if ((not found_match) and probing_cg.thread_rank() == 0) { - auto output_idx = num_matches->fetch_add(1, memory_order_relaxed); + if ((not found_match) and lane_id == 0) { + auto output_idx = counter.fetch_add(1, memory_order_relaxed); // TODO: `=` operator cannot work here // *(probe_output_begin + output_idx) = pair; // *(contained_output_begin + output_idx) = @@ -1252,7 +1258,7 @@ class static_multimap::device_view_ this->get_empty_value_sentinel(); } } - return; // exit if any slot in the window is empty + return; // exit if any slot in the current window is empty } current_slot = next_slot(current_slot); diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 722fccdae..4002ba7f2 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -691,19 +691,18 @@ template -template +template __device__ __forceinline__ void static_multimap::device_view::pair_retrieve( cooperative_groups::thread_block_tile const& probing_cg, value_type const& pair, - atomicT* num_matches, OutputIt1 probe_output_begin, OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept { constexpr bool is_outer = false; impl_.pair_retrieve( - probing_cg, pair, num_matches, probe_output_begin, contained_output_begin, pair_equal); + probing_cg, pair, probe_output_begin, contained_output_begin, 
pair_equal); } template -template +template __device__ __forceinline__ void static_multimap::device_view::pair_retrieve_outer( cooperative_groups::thread_block_tile const& probing_cg, value_type const& pair, - atomicT* num_matches, OutputIt1 probe_output_begin, OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept { constexpr bool is_outer = true; impl_.pair_retrieve( - probing_cg, pair, num_matches, probe_output_begin, contained_output_begin, pair_equal); + probing_cg, pair, probe_output_begin, contained_output_begin, pair_equal); } template + template __device__ __forceinline__ void pair_retrieve( cooperative_groups::thread_block_tile const& probing_cg, value_type const& pair, - atomicT* num_matches, OutputIt1 probe_output_begin, OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept; @@ -1045,7 +1042,6 @@ class static_multimap { * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output. * - * @tparam atomicT Type of atomic storage * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from * `InputIt`s `value_type`. 
* @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from @@ -1053,17 +1049,15 @@ class static_multimap { * @tparam PairEqual Binary callable type * @param probing_cg The Cooperative Group used to retrieve * @param pair The pair to search for - * @param num_matches Size of the output sequence * @param probe_output_begin Beginning of the output sequence of the matched probe pairs * @param contained_output_begin Beginning of the output sequence of the matched contained * pairs * @param pair_equal The binary callable used to compare two pairs for equality */ - template + template __device__ __forceinline__ void pair_retrieve_outer( cooperative_groups::thread_block_tile const& probing_cg, value_type const& pair, - atomicT* num_matches, OutputIt1 probe_output_begin, OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept; From 400e0e44dce040ed63b3e9e4f36303c3b9f6a9c9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 7 Dec 2021 14:01:25 -0500 Subject: [PATCH 55/70] Use = operator for pair assignment --- .../static_multimap/device_view_impl.inl | 63 +++++-------------- 1 file changed, 17 insertions(+), 46 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 81eda41b3..b5851d4c1 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1122,40 +1122,23 @@ class static_multimap::device_view_ } if (first_equals) { - auto output_idx = counter.fetch_add(1, memory_order_relaxed); - // TODO: `=` operator cannot work here - // *(probe_output_begin + output_idx) = pair; - // *(contained_output_begin + output_idx) = arr[0]; - thrust::get<0>(*(probe_output_begin + output_idx)) = pair.first; - thrust::get<1>(*(probe_output_begin + output_idx)) = pair.second; - thrust::get<0>(*(contained_output_begin + output_idx)) = arr[0].first; - 
thrust::get<1>(*(contained_output_begin + output_idx)) = arr[0].second; + auto output_idx = counter.fetch_add(1, memory_order_relaxed); + *(probe_output_begin + output_idx) = pair; + *(contained_output_begin + output_idx) = arr[0]; } if (second_equals) { - auto output_idx = counter.fetch_add(1, memory_order_relaxed); - // TODO: `=` operator cannot work here - // *(probe_output_begin + output_idx) = pair; - // *(contained_output_begin + output_idx) = arr[1]; - thrust::get<0>(*(probe_output_begin + output_idx)) = pair.first; - thrust::get<1>(*(probe_output_begin + output_idx)) = pair.second; - thrust::get<0>(*(contained_output_begin + output_idx)) = arr[1].first; - thrust::get<1>(*(contained_output_begin + output_idx)) = arr[1].second; + auto output_idx = counter.fetch_add(1, memory_order_relaxed); + *(probe_output_begin + output_idx) = pair; + *(contained_output_begin + output_idx) = arr[1]; } if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { if constexpr (is_outer) { if ((not found_match) and lane_id == 0) { - auto output_idx = counter.fetch_add(1, memory_order_relaxed); - // TODO: `=` operator cannot work here - // *(probe_output_begin + output_idx) = pair; - // *(contained_output_begin + output_idx) = - // cuco::make_pair(std::move(this->get_empty_key_sentinel()), - // std::move(this->get_empty_value_sentinel())); - thrust::get<0>(*(probe_output_begin + output_idx)) = pair.first; - thrust::get<1>(*(probe_output_begin + output_idx)) = pair.second; - thrust::get<0>(*(contained_output_begin + output_idx)) = this->get_empty_key_sentinel(); - thrust::get<1>(*(contained_output_begin + output_idx)) = - this->get_empty_value_sentinel(); + auto output_idx = counter.fetch_add(1, memory_order_relaxed); + *(probe_output_begin + output_idx) = pair; + *(contained_output_begin + output_idx) = cuco::make_pair( + this->get_empty_key_sentinel(), this->get_empty_value_sentinel()); } } return; // exit if any slot in the current window is empty @@ -1232,30 +1215,18 
@@ class static_multimap::device_view_ } if (equals) { - auto output_idx = counter.fetch_add(1, memory_order_relaxed); - // TODO: `=` operator cannot work here - // *(probe_output_begin + output_idx) = pair; - // *(contained_output_begin + output_idx) = slot_contents; - thrust::get<0>(*(probe_output_begin + output_idx)) = pair.first; - thrust::get<1>(*(probe_output_begin + output_idx)) = pair.second; - thrust::get<0>(*(contained_output_begin + output_idx)) = slot_contents.first; - thrust::get<1>(*(contained_output_begin + output_idx)) = slot_contents.second; + auto output_idx = counter.fetch_add(1, memory_order_relaxed); + *(probe_output_begin + output_idx) = pair; + *(contained_output_begin + output_idx) = slot_contents; } if (probing_cg.any(slot_is_empty)) { if constexpr (is_outer) { if ((not found_match) and lane_id == 0) { - auto output_idx = counter.fetch_add(1, memory_order_relaxed); - // TODO: `=` operator cannot work here - // *(probe_output_begin + output_idx) = pair; - // *(contained_output_begin + output_idx) = - // cuco::make_pair(std::move(this->get_empty_key_sentinel()), - // std::move(this->get_empty_value_sentinel())); - thrust::get<0>(*(probe_output_begin + output_idx)) = pair.first; - thrust::get<1>(*(probe_output_begin + output_idx)) = pair.second; - thrust::get<0>(*(contained_output_begin + output_idx)) = this->get_empty_key_sentinel(); - thrust::get<1>(*(contained_output_begin + output_idx)) = - this->get_empty_value_sentinel(); + auto output_idx = counter.fetch_add(1, memory_order_relaxed); + *(probe_output_begin + output_idx) = pair; + *(contained_output_begin + output_idx) = cuco::make_pair( + this->get_empty_key_sentinel(), this->get_empty_value_sentinel()); } } return; // exit if any slot in the current window is empty From 0087120b8ec713461f3a6be8e264600693833634 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 7 Dec 2021 21:07:47 -0500 Subject: [PATCH 56/70] Get rid of shared memory counter by using cg.ballot() --- 
.../static_multimap/device_view_impl.inl | 73 +++++++------------ 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index b5851d4c1..65298a106 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1093,18 +1093,10 @@ class static_multimap::device_view_ OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept { - using cuda::std::memory_order_relaxed; - auto const lane_id = probing_cg.thread_rank(); auto current_slot = initial_slot(probing_cg, pair.first); [[maybe_unused]] auto found_match = false; - __shared__ atomic_ctr_type counter; - if (lane_id == 0) { counter.store(0, memory_order_relaxed); } - - // no need to sync if `is_outer` is `true` due to the upcoming `cg.any()` - if constexpr (not is_outer) { probing_cg.sync(); } - while (true) { value_type arr[2]; load_pair_array(&arr[0], current_slot); @@ -1115,29 +1107,30 @@ class static_multimap::device_view_ detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel()); auto const first_equals = (not first_slot_is_empty and pair_equal(arr[0], pair)); auto const second_equals = (not second_slot_is_empty and pair_equal(arr[1], pair)); + auto const first_exists = probing_cg.ballot(first_equals); + auto const second_exists = probing_cg.ballot(second_equals); - if constexpr (is_outer) { - auto const exists = probing_cg.any(first_equals or second_equals); - if (exists) { found_match = true; } - } + if (first_exists or second_exists) { + if constexpr (is_outer) { found_match = true; } - if (first_equals) { - auto output_idx = counter.fetch_add(1, memory_order_relaxed); - *(probe_output_begin + output_idx) = pair; - *(contained_output_begin + output_idx) = arr[0]; - } - if (second_equals) { - auto output_idx = counter.fetch_add(1, memory_order_relaxed); - *(probe_output_begin + output_idx) = pair; 
- *(contained_output_begin + output_idx) = arr[1]; - } + auto const num_first_matches = __popc(first_exists); + if (first_equals) { + auto lane_offset = __popc(first_exists & ((1 << lane_id) - 1)); + *(probe_output_begin + lane_offset) = pair; + *(contained_output_begin + lane_offset) = arr[0]; + } + if (second_equals) { + auto lane_offset = __popc(second_exists & ((1 << lane_id) - 1)); + *(probe_output_begin + num_first_matches + lane_id) = pair; + *(contained_output_begin + num_first_matches + lane_id) = arr[1]; + } + } if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { if constexpr (is_outer) { if ((not found_match) and lane_id == 0) { - auto output_idx = counter.fetch_add(1, memory_order_relaxed); - *(probe_output_begin + output_idx) = pair; - *(contained_output_begin + output_idx) = cuco::make_pair( + *(probe_output_begin) = pair; + *(contained_output_begin) = cuco::make_pair( this->get_empty_key_sentinel(), this->get_empty_value_sentinel()); } } @@ -1186,18 +1179,10 @@ class static_multimap::device_view_ OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept { - using cuda::std::memory_order_relaxed; - auto const lane_id = probing_cg.thread_rank(); auto current_slot = initial_slot(probing_cg, pair.first); [[maybe_unused]] auto found_match = false; - __shared__ atomic_ctr_type counter; - if (lane_id == 0) { counter.store(0, memory_order_relaxed); } - - // no need to sync if `is_outer` is `true` due to the upcoming `cg.any()` - if constexpr (not is_outer) { probing_cg.sync(); } - while (true) { // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation // is unsafe! 
@@ -1208,24 +1193,22 @@ class static_multimap::device_view_ auto const slot_is_empty = detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair)); + auto const exists = probing_cg.ballot(equals); - if constexpr (is_outer) { - auto const exists = probing_cg.any(equals); - if (exists) { found_match = true; } - } + if (exists) { + if constexpr (is_outer) { found_match = true; } - if (equals) { - auto output_idx = counter.fetch_add(1, memory_order_relaxed); - *(probe_output_begin + output_idx) = pair; - *(contained_output_begin + output_idx) = slot_contents; + if (equals) { + auto const lane_offset = __popc(exists & ((1 << lane_id) - 1)); + *(probe_output_begin + lane_offset) = pair; + *(contained_output_begin + lane_offset) = slot_contents; + } } - if (probing_cg.any(slot_is_empty)) { if constexpr (is_outer) { if ((not found_match) and lane_id == 0) { - auto output_idx = counter.fetch_add(1, memory_order_relaxed); - *(probe_output_begin + output_idx) = pair; - *(contained_output_begin + output_idx) = cuco::make_pair( + *(probe_output_begin) = pair; + *(contained_output_begin) = cuco::make_pair( this->get_empty_key_sentinel(), this->get_empty_value_sentinel()); } } From 50635ff333c49b6ad5fa36f6540c55648cbab44a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 8 Dec 2021 14:46:28 -0500 Subject: [PATCH 57/70] Force type conversion for iterator value_type --- .../static_multimap/device_view_impl.inl | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 65298a106..2028f88e5 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1093,6 +1093,9 @@ class static_multimap::device_view_ OutputIt2 contained_output_begin, PairEqual pair_equal) 
noexcept { + using ProbePairType = typename thrust::iterator_traits::value_type; + using ContainedPairType = typename thrust::iterator_traits::value_type; + auto const lane_id = probing_cg.thread_rank(); auto current_slot = initial_slot(probing_cg, pair.first); [[maybe_unused]] auto found_match = false; @@ -1117,21 +1120,21 @@ class static_multimap::device_view_ if (first_equals) { auto lane_offset = __popc(first_exists & ((1 << lane_id) - 1)); - *(probe_output_begin + lane_offset) = pair; - *(contained_output_begin + lane_offset) = arr[0]; + *(probe_output_begin + lane_offset) = ProbePairType{pair}; + *(contained_output_begin + lane_offset) = ContainedPairType{arr[0]}; } if (second_equals) { auto lane_offset = __popc(second_exists & ((1 << lane_id) - 1)); - *(probe_output_begin + num_first_matches + lane_id) = pair; - *(contained_output_begin + num_first_matches + lane_id) = arr[1]; + *(probe_output_begin + num_first_matches + lane_id) = ProbePairType{pair}; + *(contained_output_begin + num_first_matches + lane_id) = ContainedPairType{arr[1]}; } } if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { if constexpr (is_outer) { if ((not found_match) and lane_id == 0) { - *(probe_output_begin) = pair; - *(contained_output_begin) = cuco::make_pair( - this->get_empty_key_sentinel(), this->get_empty_value_sentinel()); + *(probe_output_begin) = ProbePairType{pair}; + *(contained_output_begin) = ContainedPairType{cuco::make_pair( + this->get_empty_key_sentinel(), this->get_empty_value_sentinel())}; } } return; // exit if any slot in the current window is empty @@ -1179,6 +1182,9 @@ class static_multimap::device_view_ OutputIt2 contained_output_begin, PairEqual pair_equal) noexcept { + using ProbePairType = typename thrust::iterator_traits::value_type; + using ContainedPairType = typename thrust::iterator_traits::value_type; + auto const lane_id = probing_cg.thread_rank(); auto current_slot = initial_slot(probing_cg, pair.first); [[maybe_unused]] auto 
found_match = false; @@ -1200,16 +1206,16 @@ class static_multimap::device_view_ if (equals) { auto const lane_offset = __popc(exists & ((1 << lane_id) - 1)); - *(probe_output_begin + lane_offset) = pair; - *(contained_output_begin + lane_offset) = slot_contents; + *(probe_output_begin + lane_offset) = ProbePairType{pair}; + *(contained_output_begin + lane_offset) = ContainedPairType{slot_contents}; } } if (probing_cg.any(slot_is_empty)) { if constexpr (is_outer) { if ((not found_match) and lane_id == 0) { - *(probe_output_begin) = pair; - *(contained_output_begin) = cuco::make_pair( - this->get_empty_key_sentinel(), this->get_empty_value_sentinel()); + *(probe_output_begin) = ProbePairType{pair}; + *(contained_output_begin) = ContainedPairType{cuco::make_pair( + this->get_empty_key_sentinel(), this->get_empty_value_sentinel())}; } } return; // exit if any slot in the current window is empty From 600b8ab7c053b916849739e351a6e54b6312f5ac Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 8 Dec 2021 18:30:25 -0500 Subject: [PATCH 58/70] Fix an output offset bug --- .../static_multimap/device_view_impl.inl | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 2028f88e5..876c1e118 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1100,6 +1100,8 @@ class static_multimap::device_view_ auto current_slot = initial_slot(probing_cg, pair.first); [[maybe_unused]] auto found_match = false; + auto num_matches = 0; + while (true) { value_type arr[2]; load_pair_array(&arr[0], current_slot); @@ -1119,15 +1121,18 @@ class static_multimap::device_view_ auto const num_first_matches = __popc(first_exists); if (first_equals) { - auto lane_offset = __popc(first_exists & ((1 << lane_id) - 1)); - *(probe_output_begin + lane_offset) = 
ProbePairType{pair}; - *(contained_output_begin + lane_offset) = ContainedPairType{arr[0]}; + auto lane_offset = __popc(first_exists & ((1 << lane_id) - 1)); + *(probe_output_begin + num_matches + lane_offset) = ProbePairType{pair}; + *(contained_output_begin + num_matches + lane_offset) = ContainedPairType{arr[0]}; } if (second_equals) { auto lane_offset = __popc(second_exists & ((1 << lane_id) - 1)); - *(probe_output_begin + num_first_matches + lane_id) = ProbePairType{pair}; - *(contained_output_begin + num_first_matches + lane_id) = ContainedPairType{arr[1]}; + *(probe_output_begin + num_matches + num_first_matches + lane_offset) = + ProbePairType{pair}; + *(contained_output_begin + num_matches + num_first_matches + lane_offset) = + ContainedPairType{arr[1]}; } + num_matches += (num_first_matches + __popc(second_exists)); } if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { if constexpr (is_outer) { @@ -1189,6 +1194,8 @@ class static_multimap::device_view_ auto current_slot = initial_slot(probing_cg, pair.first); [[maybe_unused]] auto found_match = false; + auto num_matches = 0; + while (true) { // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation // is unsafe! 
@@ -1205,10 +1212,11 @@ class static_multimap::device_view_ if constexpr (is_outer) { found_match = true; } if (equals) { - auto const lane_offset = __popc(exists & ((1 << lane_id) - 1)); - *(probe_output_begin + lane_offset) = ProbePairType{pair}; - *(contained_output_begin + lane_offset) = ContainedPairType{slot_contents}; + auto const lane_offset = __popc(exists & ((1 << lane_id) - 1)); + *(probe_output_begin + num_matches + lane_offset) = ProbePairType{pair}; + *(contained_output_begin + num_matches + lane_offset) = ContainedPairType{slot_contents}; } + num_matches += __popc(exists); } if (probing_cg.any(slot_is_empty)) { if constexpr (is_outer) { From 4da79c1e704699d949853bd8c6feda7f49ebe1df Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 8 Dec 2021 19:05:55 -0500 Subject: [PATCH 59/70] Rename utils hpp header --- include/cuco/detail/prime.hpp | 2 +- include/cuco/detail/static_multimap/static_multimap.inl | 2 +- include/cuco/detail/{util.hpp => utils.hpp} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename include/cuco/detail/{util.hpp => utils.hpp} (100%) diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 2dceba653..93ddde1a0 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace cuco { namespace detail { diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 4002ba7f2..1739efaca 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -18,7 +18,7 @@ #include #include -#include +#include namespace { /** diff --git a/include/cuco/detail/util.hpp b/include/cuco/detail/utils.hpp similarity index 100% rename from include/cuco/detail/util.hpp rename to include/cuco/detail/utils.hpp From 53b0969b73f601270ed974931fd5f60100cf6b06 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 8 
Dec 2021 19:26:55 -0500 Subject: [PATCH 60/70] Add & use count_least_significant_bits --- .../static_multimap/device_view_impl.inl | 21 +++++++------ include/cuco/detail/utils.cuh | 31 +++++++++++++++++++ 2 files changed, 42 insertions(+), 10 deletions(-) create mode 100644 include/cuco/detail/utils.cuh diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 876c1e118..d80d91e45 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -15,6 +15,7 @@ */ #include +#include namespace cuco { @@ -924,13 +925,13 @@ class static_multimap::device_view_ output_idx = probing_cg.shfl(output_idx, 0); if (first_equals) { - auto lane_offset = __popc(first_exists & ((1 << cg_lane_id) - 1)); + auto lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id); Key key = k; output_buffer[output_idx + lane_offset] = cuco::make_pair(std::move(key), std::move(arr[0].second)); } if (second_equals) { - auto lane_offset = __popc(second_exists & ((1 << cg_lane_id) - 1)); + auto lane_offset = detail::count_least_significant_bits(second_exists, cg_lane_id); Key key = k; output_buffer[output_idx + num_first_matches + lane_offset] = cuco::make_pair(std::move(key), std::move(arr[1].second)); @@ -1024,7 +1025,7 @@ class static_multimap::device_view_ auto num_matches = __popc(exists); if (equals) { // Each match computes its lane-level offset - auto lane_offset = __popc(exists & ((1 << lane_id) - 1)); + auto lane_offset = detail::count_least_significant_bits(exists, lane_id); Key key = k; output_buffer[output_idx + lane_offset] = cuco::make_pair(std::move(key), std::move(slot_contents.second)); @@ -1121,12 +1122,12 @@ class static_multimap::device_view_ auto const num_first_matches = __popc(first_exists); if (first_equals) { - auto lane_offset = __popc(first_exists & ((1 << lane_id) - 1)); + auto lane_offset = 
detail::count_least_significant_bits(first_exists, lane_id); *(probe_output_begin + num_matches + lane_offset) = ProbePairType{pair}; *(contained_output_begin + num_matches + lane_offset) = ContainedPairType{arr[0]}; } if (second_equals) { - auto lane_offset = __popc(second_exists & ((1 << lane_id) - 1)); + auto lane_offset = detail::count_least_significant_bits(second_exists, lane_id); *(probe_output_begin + num_matches + num_first_matches + lane_offset) = ProbePairType{pair}; *(contained_output_begin + num_matches + num_first_matches + lane_offset) = @@ -1212,8 +1213,8 @@ class static_multimap::device_view_ if constexpr (is_outer) { found_match = true; } if (equals) { - auto const lane_offset = __popc(exists & ((1 << lane_id) - 1)); - *(probe_output_begin + num_matches + lane_offset) = ProbePairType{pair}; + auto const lane_offset = detail::count_least_significant_bits(exists, lane_id); + *(probe_output_begin + num_matches + lane_offset) = ProbePairType{pair}; *(contained_output_begin + num_matches + lane_offset) = ContainedPairType{slot_contents}; } num_matches += __popc(exists); @@ -1319,12 +1320,12 @@ class static_multimap::device_view_ output_idx = probing_cg.shfl(output_idx, 0); if (first_equals) { - auto lane_offset = __popc(first_exists & ((1 << cg_lane_id) - 1)); + auto lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id); probe_output_buffer[output_idx + lane_offset] = pair; contained_output_buffer[output_idx + lane_offset] = arr[0]; } if (second_equals) { - auto lane_offset = __popc(second_exists & ((1 << cg_lane_id) - 1)); + auto lane_offset = detail::count_least_significant_bits(second_exists, cg_lane_id); probe_output_buffer[output_idx + num_first_matches + lane_offset] = pair; contained_output_buffer[output_idx + num_first_matches + lane_offset] = arr[1]; } @@ -1433,7 +1434,7 @@ class static_multimap::device_view_ auto num_matches = __popc(exists); if (equals) { // Each match computes its lane-level offset - auto lane_offset 
= __popc(exists & ((1 << lane_id) - 1)); + auto lane_offset = detail::count_least_significant_bits(exists, lane_id); probe_output_buffer[output_idx + lane_offset] = pair; contained_output_buffer[output_idx + lane_offset] = slot_contents; } diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh new file mode 100644 index 000000000..2f271a742 --- /dev/null +++ b/include/cuco/detail/utils.cuh @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +namespace cuco { +namespace detail { + +/** + * @brief For the `n` least significant bits in the given unsigned 32-bit integer `x`, + * returns the number of set bits. 
+ */ +__device__ __forceinline__ int32_t count_least_significant_bits(uint32_t x, int32_t n) +{ + return __popc(x & (1 << n) - 1); +} + +} // namespace detail +} // namespace cuco From c01d987d63a3105b977b98a50dc6faca603cc06f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 8 Dec 2021 19:28:28 -0500 Subject: [PATCH 61/70] Add unit tests for non-shmem pair_retrieve --- tests/static_multimap/static_multimap_test.cu | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) diff --git a/tests/static_multimap/static_multimap_test.cu b/tests/static_multimap/static_multimap_test.cu index 0140d3223..34f316fd8 100644 --- a/tests/static_multimap/static_multimap_test.cu +++ b/tests/static_multimap/static_multimap_test.cu @@ -699,3 +699,168 @@ TEMPLATE_TEST_CASE_SIG("Tests of pair functions", test_pair_functions(map, d_pairs.begin(), num_pairs); } } + +template +__global__ void custom_pair_retrieve_outer(InputIt first, + InputIt last, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + ScanIt scan_begin, + viewT view, + PairEqual pair_equal) +{ + auto g = cg::tiled_partition(cg::this_thread_block()); + auto tid = block_size * blockIdx.x + threadIdx.x; + auto pair_idx = tid / cg_size; + + while (first + pair_idx < last) { + auto const offset = *(scan_begin + pair_idx); + auto const pair = *(first + pair_idx); + view.pair_retrieve_outer( + g, pair, probe_output_begin + offset, contained_output_begin + offset, pair_equal); + pair_idx += (gridDim.x * block_size) / cg_size; + } +} + +template +void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + thrust::device_vector> d_pairs(num_pairs); + + // pair multiplicity = 2 + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_pairs), + d_pairs.begin(), + [] __device__(auto i) { + return cuco::pair_type{i / 2, i}; + }); + + auto pair_begin = d_pairs.begin(); + + 
map.insert(pair_begin, pair_begin + num_pairs); + + // query pair matching rate = 50% + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(num_pairs), + pair_begin, + [] __device__(auto i) { + return cuco::pair_type{i, i}; + }); + + // create an array of prefix sum + thrust::device_vector d_scan(num_pairs); + auto count_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [num_pairs] __device__(auto i) { return i < (num_pairs / 2) ? 2 : 1; }); + thrust::exclusive_scan(thrust::device, count_begin, count_begin + num_pairs, d_scan.begin(), 0); + + auto constexpr gold_size = 300; + auto constexpr block_size = 128; + auto constexpr cg_size = map.cg_size(); + + auto const grid_size = (cg_size * num_pairs + block_size - 1) / block_size; + + auto view = map.get_device_view(); + + auto num = map.pair_count_outer(pair_begin, pair_begin + num_pairs, pair_equal{}); + REQUIRE(num == gold_size); + + thrust::device_vector> probe_pairs(gold_size); + thrust::device_vector> contained_pairs(gold_size); + + custom_pair_retrieve_outer + <<>>(pair_begin, + pair_begin + num_pairs, + probe_pairs.begin(), + contained_pairs.begin(), + d_scan.begin(), + view, + pair_equal{}); + + // sort before compare + thrust::sort(thrust::device, + probe_pairs.begin(), + probe_pairs.end(), + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first == rhs.first) { return lhs.second < rhs.second; } + return lhs.first < rhs.first; + }); + thrust::sort(thrust::device, + contained_pairs.begin(), + contained_pairs.end(), + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first == rhs.first) { return lhs.second < rhs.second; } + return lhs.first < rhs.first; + }); + + // set gold references + auto gold_probe_pairs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [num_pairs] __device__(auto i) { + if (i < num_pairs) { return cuco::pair{i / 2, i / 2}; } + auto val = i - 
(num_pairs / 2); + return cuco::pair{val, val}; + }); + auto gold_contained_pairs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [num_pairs] __device__(auto i) { + if (i < num_pairs / 2) { return cuco::pair{-1, -1}; } + auto val = i - (num_pairs / 2); + return cuco::pair{val / 2, val}; + }); + + REQUIRE( + thrust::equal(thrust::device, + probe_pairs.begin(), + probe_pairs.begin() + gold_size, + gold_probe_pairs, + [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); + + REQUIRE( + thrust::equal(thrust::device, + contained_pairs.begin(), + contained_pairs.begin() + gold_size, + gold_contained_pairs, + [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); +} + +TEMPLATE_TEST_CASE_SIG("Tests of non-shared-memory pair_retrieve", + "", + ((typename Key, typename Value, probe_sequence Probe), Key, Value, Probe), + (int32_t, int32_t, probe_sequence::linear_probing), + (int32_t, int64_t, probe_sequence::linear_probing), + (int64_t, int64_t, probe_sequence::linear_probing), + (int32_t, int32_t, probe_sequence::double_hashing), + (int32_t, int64_t, probe_sequence::double_hashing), + (int64_t, int64_t, probe_sequence::double_hashing)) +{ + constexpr std::size_t num_pairs{200}; + + if constexpr (Probe == probe_sequence::linear_probing) { + cuco::static_multimap, + cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + map{num_pairs * 2, -1, -1}; + test_non_shmem_pair_retrieve(map, num_pairs); + } + if constexpr (Probe == probe_sequence::double_hashing) { + cuco::static_multimap map{num_pairs * 2, -1, -1}; + test_non_shmem_pair_retrieve(map, num_pairs); + } +} From 34a77c13d563d6207bcefead963ff3a9a7bfccf6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 8 Dec 2021 19:48:23 -0500 Subject: [PATCH 62/70] Update docs --- .../static_multimap/device_view_impl.inl | 22 ++++++++++--------- 
include/cuco/static_multimap.cuh | 18 +++++++++------ 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index d80d91e45..bb59f47b6 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1060,11 +1060,12 @@ class static_multimap::device_view_ * @brief Retrieves all the matches of a given pair contained in multimap using vector * loads without shared memory buffer. * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations - * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies - * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only if - * `is_outer` is true. + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations in + * `[probe_output_begin, probe_output_begin + n)` and copies slot[j] to unspecified locations in + * `[contained_output_begin, contained_output_begin + n)`. It's users responsibility to ensure + * these locations are valid and no other threads will attempt to write to overlapping locations. + * If `p` does not have any matches, copies `p` and a pair of `empty_key_sentinel` and + * `empty_value_sentinel` into the output only if `is_outer` is true. * * @tparam is_outer Boolean flag indicating whether outer join is peformed * @tparam uses_vector_load Boolean flag indicating whether vector loads are used @@ -1154,11 +1155,12 @@ class static_multimap::device_view_ * @brief Retrieves all the matches of a given pair contained in multimap using scalar * loads without shared memory buffer. 
* - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations - * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies - * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only if - * `is_outer` is true. + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations in + * `[probe_output_begin, probe_output_begin + n)` and copies slot[j] to unspecified locations in + * `[contained_output_begin, contained_output_begin + n)`. It's users responsibility to ensure + * these locations are valid and no other threads will attempt to write to overlapping locations. + * If `p` does not have any matches, copies `p` and a pair of `empty_key_sentinel` and + * `empty_value_sentinel` into the output only if `is_outer` is true. * * @tparam is_outer Boolean flag indicating whether outer join is peformed * @tparam uses_vector_load Boolean flag indicating whether vector loads are used diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index eb3dc8167..ec543ec1e 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -963,9 +963,11 @@ class static_multimap { * @brief Retrieves all the matches of a given pair contained in multimap without using shared * memory buffer * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations - * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_end)`. + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations in + * `[probe_output_begin, probe_output_begin + n)` and copies slot[j] to unspecified locations in + * `[contained_output_begin, contained_output_begin + n)`. 
It's users responsibility to ensure + * these locations are valid and no other threads will attempt to write to overlapping + * locations. * * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from * `InputIt`s `value_type`. @@ -1037,10 +1039,12 @@ class static_multimap { * @brief Retrieves all the matches of a given pair contained in multimap without using shared * memory buffer * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations - * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies - * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output. + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations in + * `[probe_output_begin, probe_output_begin + n)` and copies slot[j] to unspecified locations in + * `[contained_output_begin, contained_output_begin + n)`. It's users responsibility to ensure + * these locations are valid and no other threads will attempt to write to overlapping + * locations. If `p` does not have any matches, copies `p` and a pair of `empty_key_sentinel` + * and `empty_value_sentinel` into the output. * * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from * `InputIt`s `value_type`. 
From 8918f18a7b68de79039ccee1eb88a3ebbc114edd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 9 Dec 2021 11:19:06 -0500 Subject: [PATCH 63/70] Code formatting --- benchmarks/hash_table/dynamic_map_bench.cu | 100 ++++--- benchmarks/hash_table/static_map_bench.cu | 105 ++++--- examples/static_map/static_map_example.cu | 15 +- include/cuco/detail/dynamic_map_kernels.cuh | 301 ++++++++++---------- include/cuco/detail/static_map.inl | 42 +-- include/cuco/static_map.cuh | 21 +- tests/dynamic_map/dynamic_map_test.cu | 72 ++--- tests/static_map/static_map_test.cu | 25 +- 8 files changed, 339 insertions(+), 342 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index cd6eadb47..995e53903 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -15,120 +15,118 @@ */ #include -#include #include #include #include +#include -enum class dist_type { - UNIQUE, - UNIFORM, - GAUSSIAN -}; +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) { +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ auto num_keys = std::distance(output_begin, output_end); - + std::random_device rd; std::mt19937 gen{rd()}; - switch(Dist) { + switch (Dist) { case dist_type::UNIQUE: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = i; } break; case dist_type::UNIFORM: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(gen())); } break; case dist_type::GAUSSIAN: std::normal_distribution<> dg{1e9, 1e7}; - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(dg(gen))); } break; } } -static void gen_final_size(benchmark::internal::Benchmark* b) { - for(auto size = 10'000'000; size <= 150'000'000; size += 
20'000'000) { +static void gen_final_size(benchmark::internal::Benchmark* b) +{ + for (auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) { b->Args({size}); } } template -static void BM_dynamic_insert(::benchmark::State& state) { +static void BM_dynamic_insert(::benchmark::State& state) +{ using map_type = cuco::dynamic_map; - - std::size_t num_keys = state.range(0); - std::size_t initial_size = 1<<27; - - std::vector h_keys( num_keys ); - std::vector> h_pairs ( num_keys ); - + + std::size_t num_keys = state.range(0); + std::size_t initial_size = 1 << 27; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + generate_keys(h_keys.begin(), h_keys.end()); - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector> d_pairs( h_pairs ); + thrust::device_vector> d_pairs(h_pairs); std::size_t batch_size = 1E6; - for(auto _ : state) { + for (auto _ : state) { map_type map{initial_size, -1, -1}; { - cuda_event_timer raii{state}; - for(auto i = 0; i < num_keys; i += batch_size) { + cuda_event_timer raii{state}; + for (auto i = 0; i < num_keys; i += batch_size) { map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); } } } - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * - int64_t(state.iterations()) * + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * int64_t(state.range(0))); } template -static void BM_dynamic_search_all(::benchmark::State& state) { +static void BM_dynamic_search_all(::benchmark::State& state) +{ using map_type = cuco::dynamic_map; - - std::size_t num_keys = state.range(0); - std::size_t initial_size = 1<<27; - std::vector h_keys( num_keys ); - std::vector> h_pairs ( num_keys ); + std::size_t num_keys = state.range(0); + std::size_t initial_size = 1 << 27; 
+ + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); generate_keys(h_keys.begin(), h_keys.end()); - - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector d_keys( h_keys ); - thrust::device_vector> d_pairs( h_pairs ); - thrust::device_vector d_results( num_keys ); + thrust::device_vector d_keys(h_keys); + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_results(num_keys); map_type map{initial_size, -1, -1}; map.insert(d_pairs.begin(), d_pairs.end()); - for(auto _ : state) { + for (auto _ : state) { cuda_event_timer raii{state}; map.find(d_keys.begin(), d_keys.end(), d_results.begin()); } - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * - int64_t(state.iterations()) * + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * int64_t(state.range(0))); } @@ -161,7 +159,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - + BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index 165465518..af8e09de1 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -15,40 +15,37 @@ */ #include -#include "cuco/static_map.cuh" -#include #include -#include +#include #include +#include #include +#include "cuco/static_map.cuh" -enum class dist_type { - UNIQUE, - UNIFORM, - GAUSSIAN -}; +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) { +template +static void 
generate_keys(OutputIt output_begin, OutputIt output_end) +{ auto num_keys = std::distance(output_begin, output_end); - + std::random_device rd; std::mt19937 gen{rd()}; - switch(Dist) { + switch (Dist) { case dist_type::UNIQUE: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = i; } break; case dist_type::UNIFORM: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(gen())); } break; case dist_type::GAUSSIAN: std::normal_distribution<> dg{1e9, 1e7}; - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(dg(gen))); } break; @@ -59,7 +56,8 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) { * @brief Generates input sizes and hash table occupancies * */ -static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) { +static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) +{ for (auto size = 100'000'000; size <= 100'000'000; size *= 10) { for (auto occupancy = 10; occupancy <= 90; occupancy += 10) { b->Args({size, occupancy}); @@ -67,31 +65,30 @@ static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) { } } - - template -static void BM_static_map_insert(::benchmark::State& state) { +static void BM_static_map_insert(::benchmark::State& state) +{ using map_type = cuco::static_map; - + std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); - std::vector h_keys( num_keys ); - std::vector> h_pairs( num_keys ); - generate_keys(h_keys.begin(), h_keys.end()); - - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; + + for (auto i = 0; i < 
num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector> d_pairs( h_pairs ); + thrust::device_vector> d_pairs(h_pairs); - for(auto _ : state) { + for (auto _ : state) { state.ResumeTiming(); state.PauseTiming(); map_type map{size, -1, -1}; @@ -102,45 +99,43 @@ static void BM_static_map_insert(::benchmark::State& state) { state.PauseTiming(); } - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * - int64_t(state.iterations()) * + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * int64_t(state.range(0))); } - - template -static void BM_static_map_search_all(::benchmark::State& state) { +static void BM_static_map_search_all(::benchmark::State& state) +{ using map_type = cuco::static_map; - + std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; map_type map{size, -1, -1}; auto view = map.get_device_mutable_view(); - std::vector h_keys( num_keys ); - std::vector h_values( num_keys ); - std::vector> h_pairs ( num_keys ); - std::vector h_results (num_keys); + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); + std::vector h_results(num_keys); generate_keys(h_keys.begin(), h_keys.end()); - - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector d_keys( h_keys ); - thrust::device_vector d_results( num_keys); - thrust::device_vector> d_pairs( h_pairs ); + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_results(num_keys); + thrust::device_vector> d_pairs(h_pairs); map.insert(d_pairs.begin(), 
d_pairs.end()); - - for(auto _ : state) { + + for (auto _ : state) { map.find(d_keys.begin(), d_keys.end(), d_results.begin()); } @@ -148,8 +143,6 @@ static void BM_static_map_search_all(::benchmark::State& state) { int64_t(state.range(0))); } - - BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); diff --git a/examples/static_map/static_map_example.cu b/examples/static_map/static_map_example.cu index 743dc021a..12d12578d 100644 --- a/examples/static_map/static_map_example.cu +++ b/examples/static_map/static_map_example.cu @@ -32,7 +32,8 @@ int main(void) // for an load factor of 50%. cudaStream_t str; cudaStreamCreate(&str); - cuco::static_map map{100'000, empty_key_sentinel, empty_value_sentinel, cuco::cuda_allocator{}, str}; + cuco::static_map map{ + 100'000, empty_key_sentinel, empty_value_sentinel, cuco::cuda_allocator{}, str}; thrust::device_vector> pairs(50'000); @@ -43,8 +44,8 @@ int main(void) [] __device__(auto i) { return thrust::make_pair(i, i); }); // Inserts all pairs into the map - map.insert(pairs.begin(), pairs.end(), cuco::detail::MurmurHash3_32{}, - thrust::equal_to{}, str); + map.insert( + pairs.begin(), pairs.end(), cuco::detail::MurmurHash3_32{}, thrust::equal_to{}, str); // Sequence of keys {0, 1, 2, ...} thrust::device_vector keys_to_find(50'000); @@ -53,8 +54,12 @@ int main(void) // Finds all keys {0, 1, 2, ...} and stores associated values into `found_values` // If a key `keys_to_find[i]` doesn't exist, `found_values[i] == empty_value_sentinel` - map.find(keys_to_find.begin(), keys_to_find.end(), found_values.begin(), - cuco::detail::MurmurHash3_32{}, thrust::equal_to{}, str); + map.find(keys_to_find.begin(), + keys_to_find.end(), + found_values.begin(), + cuco::detail::MurmurHash3_32{}, + thrust::equal_to{}, + str); cudaStreamSynchronize(str); return 0; diff --git a/include/cuco/detail/dynamic_map_kernels.cuh 
b/include/cuco/detail/dynamic_map_kernels.cuh index 28690b0dc..c1e21e863 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -19,17 +19,17 @@ namespace detail { namespace cg = cooperative_groups; /** - * @brief Inserts all key/value pairs in the range `[first, last)`. - * + * @brief Inserts all key/value pairs in the range `[first, last)`. + * * If multiple keys in `[first, last)` compare equal, it is unspecified which * element is inserted. - * - * @tparam block_size + * + * @tparam block_size * @tparam pair_type Type of the pairs contained in the map * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `value_type` * @tparam viewT Type of the `static_map` device views - * @tparam mutableViewT Type of the `static_map` device mutable views + * @tparam mutableViewT Type of the `static_map` device mutable views * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type @@ -38,7 +38,7 @@ namespace cg = cooperative_groups; * @param last End of the sequence of key/value pairs * @param submap_views Array of `static_map::device_view` objects used to * perform `contains` operations on each underlying `static_map` - * @param submap_mutable_views Array of `static_map::device_mutable_view` objects + * @param submap_mutable_views Array of `static_map::device_mutable_view` objects * used to perform an `insert` into the target `static_map` submap * @param num_successes The number of successfully inserted key/value pairs * @param insert_idx The index of the submap we are inserting into @@ -46,14 +46,14 @@ namespace cg = cooperative_groups; * @param hash The unary function to apply to hash each key * @param key_equal The binary function used to compare two keys for equality */ -template +template __global__ void insert(InputIt first, InputIt last, viewT* submap_views, @@ -62,58 +62,55 @@ 
__global__ void insert(InputIt first, uint32_t insert_idx, uint32_t num_submaps, Hash hash, - KeyEqual key_equal) { + KeyEqual key_equal) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - + auto tid = blockDim.x * blockIdx.x + threadIdx.x; - while(first + tid < last) { + while (first + tid < last) { pair_type insert_pair = *(first + tid); - auto exists = false; - + auto exists = false; + // manually check for duplicates in those submaps we are not inserting into - for(auto i = 0; i < num_submaps; ++i) { - if(i != insert_idx) { + for (auto i = 0; i < num_submaps; ++i) { + if (i != insert_idx) { exists = submap_views[i].contains(insert_pair.first, hash, key_equal); - if(exists) { - break; - } + if (exists) { break; } } } - if(!exists) { - if(submap_mutable_views[insert_idx].insert(insert_pair, hash, key_equal)) { + if (!exists) { + if (submap_mutable_views[insert_idx].insert(insert_pair, hash, key_equal)) { thread_num_successes++; } } tid += gridDim.x * blockDim.x; } - + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if(threadIdx.x == 0) { - *num_successes += block_num_successes; - } + if (threadIdx.x == 0) { *num_successes += block_num_successes; } } /** - * @brief Inserts all key/value pairs in the range `[first, last)`. - * + * @brief Inserts all key/value pairs in the range `[first, last)`. + * * If multiple keys in `[first, last)` compare equal, it is unspecified which * element is inserted. Uses the CUDA Cooperative Groups API to leverage groups - * of multiple threads to perform each key/value insertion. This provides a + * of multiple threads to perform each key/value insertion. This provides a * significant boost in throughput compared to the non Cooperative Group * `insert` at moderate to high load factors. 
- * - * @tparam block_size + * + * @tparam block_size * @tparam tile_size The number of threads in the Cooperative Groups used to perform * inserts * @tparam pair_type Type of the pairs contained in the map * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `value_type` * @tparam viewT Type of the `static_map` device views - * @tparam mutableViewT Type of the `static_map` device mutable views + * @tparam mutableViewT Type of the `static_map` device mutable views * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type @@ -122,7 +119,7 @@ __global__ void insert(InputIt first, * @param last End of the sequence of key/value pairs * @param submap_views Array of `static_map::device_view` objects used to * perform `contains` operations on each underlying `static_map` - * @param submap_mutable_views Array of `static_map::device_mutable_view` objects + * @param submap_mutable_views Array of `static_map::device_mutable_view` objects * used to perform an `insert` into the target `static_map` submap * @param num_successes The number of successfully inserted key/value pairs * @param insert_idx The index of the submap we are inserting into @@ -130,14 +127,15 @@ __global__ void insert(InputIt first, * @param hash The unary function to apply to hash each key * @param key_equal The binary function used to compare two keys for equality */ -template +template __global__ void insert(InputIt first, InputIt last, viewT* submap_views, @@ -146,54 +144,51 @@ __global__ void insert(InputIt first, uint32_t insert_idx, uint32_t num_submaps, Hash hash, - KeyEqual key_equal) { + KeyEqual key_equal) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - + auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto it 
= first + tid / tile_size; + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto it = first + tid / tile_size; - while(it < last) { + while (it < last) { pair_type insert_pair = *it; - auto exists = false; - + auto exists = false; + // manually check for duplicates in those submaps we are not inserting into - for(auto i = 0; i < num_submaps; ++i) { - if(i != insert_idx) { + for (auto i = 0; i < num_submaps; ++i) { + if (i != insert_idx) { exists = submap_views[i].contains(tile, insert_pair.first, hash, key_equal); - if(exists) { - break; - } + if (exists) { break; } } } - if(!exists) { - if(submap_mutable_views[insert_idx].insert(tile, insert_pair, hash, key_equal) && - tile.thread_rank() == 0) { + if (!exists) { + if (submap_mutable_views[insert_idx].insert(tile, insert_pair, hash, key_equal) && + tile.thread_rank() == 0) { thread_num_successes++; } } it += (gridDim.x * blockDim.x) / tile_size; } - + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if(threadIdx.x == 0) { - *num_successes += block_num_successes; - } + if (threadIdx.x == 0) { *num_successes += block_num_successes; } } /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. - * Else, copies the empty value sentinel. + * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. + * Else, copies the empty value sentinel. 
* @tparam block_size The number of threads in the thread block - * @tparam Value The mapped value type for the map + * @tparam Value The mapped value type for the map * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is + * @tparam OutputIt Device accessible output iterator whose `value_type` is * convertible to the map's `mapped_type` * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type @@ -207,37 +202,39 @@ __global__ void insert(InputIt first, * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality */ -template +template __global__ void find(InputIt first, InputIt last, OutputIt output_begin, viewT* submap_views, uint32_t num_submaps, Hash hash, - KeyEqual key_equal) { - auto tid = blockDim.x * blockIdx.x + threadIdx.x; + KeyEqual key_equal) +{ + auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); __shared__ Value writeBuffer[block_size]; - while(first + tid < last) { - auto key = *(first + tid); + while (first + tid < last) { + auto key = *(first + tid); auto found_value = empty_value_sentinel; - for(auto i = 0; i < num_submaps; ++i) { + for (auto i = 0; i < num_submaps; ++i) { auto submap_view = submap_views[i]; - auto found = submap_view.find(key, hash, key_equal); - if(found != submap_view.end()) { + auto found = submap_view.find(key, hash, key_equal); + if (found != submap_view.end()) { found_value = found->second; break; } } - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to * flush more frequently, causing increased sector stores from L2 to global memory. 
* By writing results to shared memory and then synchronizing before writing back * to global, we no longer rely on L1, preventing the increase in sector stores from @@ -252,10 +249,10 @@ __global__ void find(InputIt first, /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. + * + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. * Else, copies the empty value sentinel. Uses the CUDA Cooperative Groups API to leverage groups - * of multiple threads to find each key. This provides a significant boost in throughput compared + * of multiple threads to find each key. This provides a significant boost in throughput compared * to the non Cooperative Group `find` at moderate to high load factors. * * @tparam block_size The number of threads in the thread block @@ -264,7 +261,7 @@ __global__ void find(InputIt first, * @tparam Value The mapped value type for the map * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is + * @tparam OutputIt Device accessible output iterator whose `value_type` is * convertible to the map's `mapped_type` * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type @@ -278,49 +275,50 @@ __global__ void find(InputIt first, * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality */ -template +template __global__ void find(InputIt first, InputIt last, OutputIt output_begin, viewT* submap_views, uint32_t num_submaps, Hash hash, - KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; + KeyEqual key_equal) +{ + auto tile = 
cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); __shared__ Value writeBuffer[block_size]; - while(first + key_idx < last) { - auto key = *(first + key_idx); + while (first + key_idx < last) { + auto key = *(first + key_idx); auto found_value = empty_value_sentinel; - for(auto i = 0; i < num_submaps; ++i) { + for (auto i = 0; i < num_submaps; ++i) { auto submap_view = submap_views[i]; - auto found = submap_view.find(tile, key, hash, key_equal); - if(found != submap_view.end()) { + auto found = submap_view.find(tile, key, hash, key_equal); + if (found != submap_view.end()) { found_value = found->second; break; } } - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to * flush more frequently, causing increased sector stores from L2 to global memory. * By writing results to shared memory and then synchronizing before writing back * to global, we no longer rely on L1, preventing the increase in sector stores from * L2 to global and improving performance. */ - if(tile.thread_rank() == 0) { - writeBuffer[threadIdx.x / tile_size] = found_value; - } + if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found_value; } __syncthreads(); - if(tile.thread_rank() == 0) { + if (tile.thread_rank() == 0) { *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; } key_idx += (gridDim.x * blockDim.x) / tile_size; @@ -329,13 +327,13 @@ __global__ void find(InputIt first, /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * + * * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
* * @tparam block_size The number of threads in the thread block * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is + * @tparam OutputIt Device accessible output iterator whose `value_type` is * convertible to the map's `mapped_type` * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type @@ -349,33 +347,33 @@ __global__ void find(InputIt first, * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality */ -template +template __global__ void contains(InputIt first, InputIt last, OutputIt output_begin, viewT* submap_views, uint32_t num_submaps, Hash hash, - KeyEqual key_equal) { + KeyEqual key_equal) +{ auto tid = blockDim.x * blockIdx.x + threadIdx.x; __shared__ bool writeBuffer[block_size]; - while(first + tid < last) { - auto key = *(first + tid); + while (first + tid < last) { + auto key = *(first + tid); auto found = false; - for(auto i = 0; i < num_submaps; ++i) { + for (auto i = 0; i < num_submaps; ++i) { found = submap_views[i].contains(key, hash, key_equal); - if(found) { - break; - } + if (found) { break; } } - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to + + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to * flush more frequently, causing increased sector stores from L2 to global memory. * By writing results to shared memory and then synchronizing before writing back * to global, we no longer rely on L1, preventing the increase in sector stores from @@ -390,10 +388,10 @@ __global__ void contains(InputIt first, /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * + * * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
- * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the - * contains operation for each key. This provides a significant boost in throughput compared + * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the + * contains operation for each key. This provides a significant boost in throughput compared * to the non Cooperative Group `contains` at moderate to high load factors. * * @tparam block_size The number of threads in the thread block @@ -401,7 +399,7 @@ __global__ void contains(InputIt first, * perform find operations * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is + * @tparam OutputIt Device accessible output iterator whose `value_type` is * convertible to the map's `mapped_type` * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type @@ -415,49 +413,48 @@ __global__ void contains(InputIt first, * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality */ -template +template __global__ void contains(InputIt first, InputIt last, OutputIt output_begin, viewT* submap_views, uint32_t num_submaps, Hash hash, - KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; + KeyEqual key_equal) +{ + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto key_idx = tid / tile_size; __shared__ bool writeBuffer[block_size]; - while(first + key_idx < last) { - auto key = *(first + key_idx); + while (first + key_idx < last) { + auto key = *(first + key_idx); auto found = false; - for(auto i = 0; i < num_submaps; ++i) { + for (auto i = 0; i < num_submaps; ++i) { found = submap_views[i].contains(tile, key, hash, key_equal); - 
if(found) { - break; - } + if (found) { break; } } - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to * flush more frequently, causing increased sector stores from L2 to global memory. * By writing results to shared memory and then synchronizing before writing back * to global, we no longer rely on L1, preventing the increase in sector stores from * L2 to global and improving performance. */ - if(tile.thread_rank() == 0) { - writeBuffer[threadIdx.x / tile_size] = found; - } + if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } __syncthreads(); - if(tile.thread_rank() == 0) { + if (tile.thread_rank() == 0) { *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; } key_idx += (gridDim.x * blockDim.x) / tile_size; } } -} // namespace detail -} // namespace cuco \ No newline at end of file +} // namespace detail +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 4da383ace..1719970a7 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -37,8 +37,8 @@ static_map::static_map(std::size_t capacity, auto constexpr stride = 4; auto const grid_size = (capacity_ + stride * block_size - 1) / (stride * block_size); detail::initialize - <<>>(slots_, empty_key_sentinel, empty_value_sentinel, - capacity_); + <<>>( + slots_, empty_key_sentinel, empty_value_sentinel, capacity_); } template @@ -50,11 +50,8 @@ static_map::~static_map() template template -void static_map::insert(InputIt first, - InputIt last, - Hash hash, - KeyEqual key_equal, - cudaStream_t stream) +void static_map::insert( + InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } @@ -70,14 +67,12 @@ void static_map::insert(InputIt first, 
CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); std::size_t h_num_successes; - detail::insert - <<>>(first, first + num_keys, num_successes_, - view, hash, key_equal); + detail::insert<<>>( + first, first + num_keys, num_successes_, view, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, - stream)); - - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated + &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); + + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated size_ += h_num_successes; } @@ -121,9 +116,12 @@ void static_map::insert_if(InputIt first, template template -void static_map::find( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal, - cudaStream_t stream) +void static_map::find(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } @@ -136,14 +134,16 @@ void static_map::find( detail::find <<>>(first, last, output_begin, view, hash, key_equal); - } template template -void static_map::contains( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal, - cudaStream_t stream) +void static_map::contains(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { auto num_keys = std::distance(first, last); if (num_keys == 0) { return; } diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index d46a6e139..321b1f3da 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -195,7 +195,7 @@ class static_map { Key empty_key_sentinel, Value empty_value_sentinel, Allocator const& alloc = Allocator{}, - cudaStream_t stream = 0); + 
cudaStream_t stream = 0); /** * @brief Destroys the map and frees its contents. @@ -206,7 +206,7 @@ class static_map { /** * @brief Inserts all key/value pairs in the range `[first, last)`. * - * This function synchronizes `stream`. + * This function synchronizes `stream`. * * If multiple keys in `[first, last)` compare equal, it is unspecified which * element is inserted. @@ -224,8 +224,11 @@ class static_map { template , typename KeyEqual = thrust::equal_to> - void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + void insert(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = 0); /** * @brief Inserts key/value pairs in the range `[first, last)` if `pred` @@ -289,12 +292,12 @@ class static_map { void find(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = 0); /** - * @brief Indicates whether the keys in the range + * @brief Indicates whether the keys in the range * `[first, last)` are contained in the map. * * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. 
@@ -319,8 +322,8 @@ class static_map { void contains(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = 0); private: diff --git a/tests/dynamic_map/dynamic_map_test.cu b/tests/dynamic_map/dynamic_map_test.cu index 3e4b94f02..fd38ea642 100644 --- a/tests/dynamic_map/dynamic_map_test.cu +++ b/tests/dynamic_map/dynamic_map_test.cu @@ -22,33 +22,30 @@ #include #include -enum class dist_type { - UNIQUE, - UNIFORM, - GAUSSIAN -}; - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) { +enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; + +template +static void generate_keys(OutputIt output_begin, OutputIt output_end) +{ auto num_keys = std::distance(output_begin, output_end); std::random_device rd; std::mt19937 gen{rd()}; - switch(Dist) { + switch (Dist) { case dist_type::UNIQUE: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = i; } break; case dist_type::UNIFORM: - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(gen())); } break; case dist_type::GAUSSIAN: std::normal_distribution<> dg{1e9, 1e7}; - for(auto i = 0; i < num_keys; ++i) { + for (auto i = 0; i < num_keys; ++i) { output_begin[i] = std::abs(static_cast(dg(gen))); } break; @@ -78,12 +75,15 @@ bool none_of(Iterator begin, Iterator end, Predicate p) } } // namespace - -TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", +TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", + "", ((typename T, dist_type Dist), T, Dist), - (int32_t, dist_type::UNIQUE), (int64_t, dist_type::UNIQUE), - (int32_t, dist_type::UNIFORM), (int64_t, dist_type::UNIFORM), - (int32_t, dist_type::GAUSSIAN), (int64_t, dist_type::GAUSSIAN)) + (int32_t, dist_type::UNIQUE), + (int64_t, dist_type::UNIQUE), + (int32_t, dist_type::UNIFORM), + (int64_t, dist_type::UNIFORM), + 
(int32_t, dist_type::GAUSSIAN), + (int64_t, dist_type::GAUSSIAN)) { using Key = T; using Value = T; @@ -91,25 +91,25 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", constexpr std::size_t num_keys{50'000'000}; cuco::dynamic_map map{30'000'000, -1, -1}; - std::vector h_keys( num_keys ); - std::vector h_values( num_keys ); - std::vector> h_pairs ( num_keys ); + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); generate_keys(h_keys.begin(), h_keys.end()); - for(auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_values[i] = val; - h_pairs[i].first = key; + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_values[i] = val; + h_pairs[i].first = key; h_pairs[i].second = val; } - thrust::device_vector d_keys( h_keys ); - thrust::device_vector d_values( h_values ); - thrust::device_vector> d_pairs( h_pairs ); - thrust::device_vector d_results( num_keys ); - thrust::device_vector d_contained( num_keys ); + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_values(h_values); + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_results(num_keys); + thrust::device_vector d_contained(num_keys); // bulk function test cases SECTION("All inserted keys-value pairs should be correctly recovered during find") @@ -118,8 +118,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", map.find(d_keys.begin(), d_keys.end(), d_results.begin()); auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); - REQUIRE(all_of(zip, zip + num_keys, - [] __device__(auto const& p) { + REQUIRE(all_of(zip, zip + num_keys, [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); })); } @@ -128,7 +127,8 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", { map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - REQUIRE(all_of(d_results.begin(), d_results.end(), [] __device__(auto 
const& p) { return p == -1; })); + REQUIRE( + all_of(d_results.begin(), d_results.end(), [] __device__(auto const& p) { return p == -1; })); } SECTION("All inserted keys-value pairs should be contained") @@ -136,13 +136,15 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", map.insert(d_pairs.begin(), d_pairs.end()); map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - REQUIRE(all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + REQUIRE( + all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); } SECTION("Non-inserted keys-value pairs should not be contained") { map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - REQUIRE(none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); + REQUIRE( + none_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; })); } } \ No newline at end of file diff --git a/tests/static_map/static_map_test.cu b/tests/static_map/static_map_test.cu index 313936e95..622e3a80d 100644 --- a/tests/static_map/static_map_test.cu +++ b/tests/static_map/static_map_test.cu @@ -32,7 +32,7 @@ template bool all_of(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) { auto size = thrust::distance(begin, end); - auto out = thrust::count_if(thrust::cuda::par.on(stream), begin, end, p); + auto out = thrust::count_if(thrust::cuda::par.on(stream), begin, end, p); cudaStreamSynchronize(stream); return size == out; } @@ -516,8 +516,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", cudaStreamCreate(&stream); constexpr std::size_t num_keys{500'000}; - cuco::static_map map{1'000'000, -1, -1, - cuco::cuda_allocator{}, stream}; + cuco::static_map map{1'000'000, -1, -1, cuco::cuda_allocator{}, stream}; auto m_view = map.get_device_mutable_view(); auto view = map.get_device_view(); @@ -542,7 +541,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given 
stream", thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); - auto hash_fn = cuco::detail::MurmurHash3_32{}; + auto hash_fn = cuco::detail::MurmurHash3_32{}; auto equal_fn = thrust::equal_to{}; // bulk function test cases @@ -550,22 +549,22 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", { map.insert(d_pairs.begin(), d_pairs.end(), hash_fn, equal_fn, stream); map.find(d_keys.begin(), d_keys.end(), d_results.begin(), hash_fn, equal_fn, stream); - //cudaStreamSynchronize(stream); + // cudaStreamSynchronize(stream); auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); - REQUIRE(all_of(zip, zip + num_keys, [] __device__(auto const& p) { - return thrust::get<0>(p) == thrust::get<1>(p); - }, stream)); + REQUIRE(all_of( + zip, + zip + num_keys, + [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); }, + stream)); } SECTION("All inserted keys-value pairs should be contained") { map.insert(d_pairs.begin(), d_pairs.end(), hash_fn, equal_fn, stream); - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin(), hash_fn, - equal_fn, stream); + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin(), hash_fn, equal_fn, stream); - REQUIRE( - all_of(d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; }, - stream)); + REQUIRE(all_of( + d_contained.begin(), d_contained.end(), [] __device__(bool const& b) { return b; }, stream)); } cudaStreamDestroy(stream); From de126590b996e97d46e59fbd7384085ee34ee67d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 10 Dec 2021 19:44:45 -0500 Subject: [PATCH 64/70] Non-shmem pair_retrieve take 4 output iterators --- .../static_multimap/device_view_impl.inl | 119 +++++++++++------- .../static_multimap/static_multimap.inl | 42 +++++-- include/cuco/static_multimap.cuh | 77 ++++++++---- tests/static_multimap/static_multimap_test.cu | 100 ++++++++------- 4 files changed, 208 
insertions(+), 130 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index bb59f47b6..ca41f42a1 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1060,26 +1060,32 @@ class static_multimap::device_view_ * @brief Retrieves all the matches of a given pair contained in multimap using vector * loads without shared memory buffer. * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations in - * `[probe_output_begin, probe_output_begin + n)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_begin + n)`. It's users responsibility to ensure - * these locations are valid and no other threads will attempt to write to overlapping locations. - * If `p` does not have any matches, copies `p` and a pair of `empty_key_sentinel` and - * `empty_value_sentinel` into the output only if `is_outer` is true. + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p.first` and `p.second` to + * unspecified locations started at `probe_key_begin` and `probe_val_begin`, and copies + * `slot[j].first` and `slot[j].second` to unspecified locations started at `contained_key_begin` + * and `contained_val_begin`. It's users responsibility to ensure these locations are valid and no + * other threads will attempt to write to overlapping locations. If `p` does not have any matches, + * copies `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only + * if `is_outer` is true. 
* * @tparam is_outer Boolean flag indicating whether outer join is peformed * @tparam uses_vector_load Boolean flag indicating whether vector loads are used * @tparam ProbingCG Type of Cooperative Group used to retrieve * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. + * `pair`'s `Key` type. * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. + * `pair`'s `Value` type. + * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from + * the map's `key_type`. + * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from + * the map's `mapped_type`. * @tparam PairEqual Binary callable type * @param probing_cg The Cooperative Group used to retrieve * @param pair The pair to search for - * @param probe_output_begin Beginning of the output sequence of the matched probe pairs - * @param contained_output_begin Beginning of the output sequence of the matched contained - * pairs + * @param probe_key_begin Beginning of the output sequence of the matched probe keys + * @param probe_val_begin Beginning of the output sequence of the matched probe values + * @param contained_key_begin Beginning of the output sequence of the matched contained keys + * @param contained_val_begin Beginning of the output sequence of the matched contained values * @param pair_equal The binary callable used to compare two pairs for equality */ template ::device_view_ typename ProbingCG, typename OutputIt1, typename OutputIt2, + typename OutputIt3, + typename OutputIt4, typename PairEqual> __device__ __forceinline__ std::enable_if_t pair_retrieve( ProbingCG const& probing_cg, value_type const& pair, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, 
PairEqual pair_equal) noexcept { - using ProbePairType = typename thrust::iterator_traits::value_type; - using ContainedPairType = typename thrust::iterator_traits::value_type; - auto const lane_id = probing_cg.thread_rank(); auto current_slot = initial_slot(probing_cg, pair.first); [[maybe_unused]] auto found_match = false; @@ -1123,25 +1130,32 @@ class static_multimap::device_view_ auto const num_first_matches = __popc(first_exists); if (first_equals) { - auto lane_offset = detail::count_least_significant_bits(first_exists, lane_id); - *(probe_output_begin + num_matches + lane_offset) = ProbePairType{pair}; - *(contained_output_begin + num_matches + lane_offset) = ContainedPairType{arr[0]}; + auto lane_offset = detail::count_least_significant_bits(first_exists, lane_id); + auto const output_idx = num_matches + lane_offset; + + *(probe_key_begin + output_idx) = pair.first; + *(probe_val_begin + output_idx) = pair.second; + *(contained_key_begin + output_idx) = arr[0].first; + *(contained_val_begin + output_idx) = arr[0].second; } if (second_equals) { - auto lane_offset = detail::count_least_significant_bits(second_exists, lane_id); - *(probe_output_begin + num_matches + num_first_matches + lane_offset) = - ProbePairType{pair}; - *(contained_output_begin + num_matches + num_first_matches + lane_offset) = - ContainedPairType{arr[1]}; + auto const lane_offset = detail::count_least_significant_bits(second_exists, lane_id); + auto const output_idx = num_matches + num_first_matches + lane_offset; + + *(probe_key_begin + output_idx) = pair.first; + *(probe_val_begin + output_idx) = pair.second; + *(contained_key_begin + output_idx) = arr[1].first; + *(contained_val_begin + output_idx) = arr[1].second; } num_matches += (num_first_matches + __popc(second_exists)); } if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { if constexpr (is_outer) { if ((not found_match) and lane_id == 0) { - *(probe_output_begin) = ProbePairType{pair}; - *(contained_output_begin) 
= ContainedPairType{cuco::make_pair( - this->get_empty_key_sentinel(), this->get_empty_value_sentinel())}; + *(probe_key_begin) = pair.first; + *(probe_val_begin) = pair.second; + *(contained_key_begin) = this->get_empty_key_sentinel(); + *(contained_val_begin) = this->get_empty_value_sentinel(); } } return; // exit if any slot in the current window is empty @@ -1155,26 +1169,32 @@ class static_multimap::device_view_ * @brief Retrieves all the matches of a given pair contained in multimap using scalar * loads without shared memory buffer. * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations in - * `[probe_output_begin, probe_output_begin + n)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_begin + n)`. It's users responsibility to ensure - * these locations are valid and no other threads will attempt to write to overlapping locations. - * If `p` does not have any matches, copies `p` and a pair of `empty_key_sentinel` and - * `empty_value_sentinel` into the output only if `is_outer` is true. + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p.first` and `p.second` to + * unspecified locations started at `probe_key_begin` and `probe_val_begin`, and copies + * `slot[j].first` and `slot[j].second` to unspecified locations started at `contained_key_begin` + * and `contained_val_begin`. It's users responsibility to ensure these locations are valid and no + * other threads will attempt to write to overlapping locations. If `p` does not have any matches, + * copies `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only + * if `is_outer` is true. 
* * @tparam is_outer Boolean flag indicating whether outer join is peformed * @tparam uses_vector_load Boolean flag indicating whether vector loads are used * @tparam ProbingCG Type of Cooperative Group used to retrieve * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. + * `pair`'s `Key` type. * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. + * `pair`'s `Value` type. + * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from + * the map's `key_type`. + * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from + * the map's `mapped_type`. * @tparam PairEqual Binary callable type * @param probing_cg The Cooperative Group used to retrieve * @param pair The pair to search for - * @param probe_output_begin Beginning of the output sequence of the matched probe pairs - * @param contained_output_begin Beginning of the output sequence of the matched contained - * pairs + * @param probe_key_begin Beginning of the output sequence of the matched probe keys + * @param probe_val_begin Beginning of the output sequence of the matched probe values + * @param contained_key_begin Beginning of the output sequence of the matched contained keys + * @param contained_val_begin Beginning of the output sequence of the matched contained values * @param pair_equal The binary callable used to compare two pairs for equality */ template ::device_view_ typename ProbingCG, typename OutputIt1, typename OutputIt2, + typename OutputIt3, + typename OutputIt4, typename PairEqual> __device__ __forceinline__ std::enable_if_t pair_retrieve( ProbingCG const& probing_cg, value_type const& pair, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, 
PairEqual pair_equal) noexcept { using ProbePairType = typename thrust::iterator_traits::value_type; @@ -1216,17 +1240,22 @@ class static_multimap::device_view_ if (equals) { auto const lane_offset = detail::count_least_significant_bits(exists, lane_id); - *(probe_output_begin + num_matches + lane_offset) = ProbePairType{pair}; - *(contained_output_begin + num_matches + lane_offset) = ContainedPairType{slot_contents}; + auto const output_idx = num_matches + lane_offset; + + *(probe_key_begin + output_idx) = pair.first; + *(probe_val_begin + output_idx) = pair.second; + *(contained_key_begin + output_idx) = slot_contents.first; + *(contained_val_begin + output_idx) = slot_contents.second; } num_matches += __popc(exists); } if (probing_cg.any(slot_is_empty)) { if constexpr (is_outer) { if ((not found_match) and lane_id == 0) { - *(probe_output_begin) = ProbePairType{pair}; - *(contained_output_begin) = ContainedPairType{cuco::make_pair( - this->get_empty_key_sentinel(), this->get_empty_value_sentinel())}; + *(probe_key_begin) = pair.first; + *(probe_val_begin) = pair.second; + *(contained_key_begin) = this->get_empty_key_sentinel(); + *(contained_val_begin) = this->get_empty_value_sentinel(); } } return; // exit if any slot in the current window is empty diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 1739efaca..f1f325e8c 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -691,18 +691,29 @@ template -template +template __device__ __forceinline__ void static_multimap::device_view::pair_retrieve( cooperative_groups::thread_block_tile const& probing_cg, value_type const& pair, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, PairEqual pair_equal) noexcept { 
constexpr bool is_outer = false; - impl_.pair_retrieve( - probing_cg, pair, probe_output_begin, contained_output_begin, pair_equal); + impl_.pair_retrieve(probing_cg, + pair, + probe_key_begin, + probe_val_begin, + contained_key_begin, + contained_val_begin, + pair_equal); } template -template +template __device__ __forceinline__ void static_multimap::device_view::pair_retrieve_outer( cooperative_groups::thread_block_tile const& probing_cg, value_type const& pair, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, PairEqual pair_equal) noexcept { constexpr bool is_outer = true; - impl_.pair_retrieve( - probing_cg, pair, probe_output_begin, contained_output_begin, pair_equal); + impl_.pair_retrieve(probing_cg, + pair, + probe_key_begin, + probe_val_begin, + contained_key_begin, + contained_val_begin, + pair_equal); } template + template __device__ __forceinline__ void pair_retrieve( cooperative_groups::thread_block_tile const& probing_cg, value_type const& pair, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, PairEqual pair_equal) noexcept; /** @@ -1039,31 +1050,43 @@ class static_multimap { * @brief Retrieves all the matches of a given pair contained in multimap without using shared * memory buffer * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations in - * `[probe_output_begin, probe_output_begin + n)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_begin + n)`. It's users responsibility to ensure - * these locations are valid and no other threads will attempt to write to overlapping - * locations. 
If `p` does not have any matches, copies `p` and a pair of `empty_key_sentinel` - * and `empty_value_sentinel` into the output. + * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p.first` and `p.second` to + * unspecified locations started at `probe_key_begin` and `probe_val_begin`, and copies + * `slot[j].first` and `slot[j].second` to unspecified locations started at + * `contained_key_begin` and `contained_val_begin`. It's users responsibility to ensure these + * locations are valid and no other threads will attempt to write to overlapping locations. If + * `p` does not have any matches, copies `p` and a pair of `empty_key_sentinel` and + * `empty_value_sentinel` into the output. * * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. + * `pair`'s `Key` type. * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. + * `pair`'s `Value` type. + * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from + * the map's `key_type`. + * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from + * the map's `mapped_type`. 
* @tparam PairEqual Binary callable type * @param probing_cg The Cooperative Group used to retrieve * @param pair The pair to search for - * @param probe_output_begin Beginning of the output sequence of the matched probe pairs - * @param contained_output_begin Beginning of the output sequence of the matched contained - * pairs + * @param probe_key_begin Beginning of the output sequence of the matched probe keys + * @param probe_val_begin Beginning of the output sequence of the matched probe values + * @param contained_key_begin Beginning of the output sequence of the matched contained keys + * @param contained_val_begin Beginning of the output sequence of the matched contained values * @param pair_equal The binary callable used to compare two pairs for equality */ - template + template __device__ __forceinline__ void pair_retrieve_outer( cooperative_groups::thread_block_tile const& probing_cg, value_type const& pair, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, PairEqual pair_equal) noexcept; /** diff --git a/tests/static_multimap/static_multimap_test.cu b/tests/static_multimap/static_multimap_test.cu index 34f316fd8..6a48c7b18 100644 --- a/tests/static_multimap/static_multimap_test.cu +++ b/tests/static_multimap/static_multimap_test.cu @@ -705,13 +705,17 @@ template __global__ void custom_pair_retrieve_outer(InputIt first, InputIt last, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, + OutputIt1 probe_key_begin, + OutputIt2 probe_val_begin, + OutputIt3 contained_key_begin, + OutputIt4 contained_val_begin, ScanIt scan_begin, viewT view, PairEqual pair_equal) @@ -723,8 +727,13 @@ __global__ void custom_pair_retrieve_outer(InputIt first, while (first + pair_idx < last) { auto const offset = *(scan_begin + pair_idx); auto const pair = *(first + pair_idx); - view.pair_retrieve_outer( - g, pair, 
probe_output_begin + offset, contained_output_begin + offset, pair_equal); + view.pair_retrieve_outer(g, + pair, + probe_key_begin + offset, + probe_val_begin + offset, + contained_key_begin + offset, + contained_val_begin + offset, + pair_equal); pair_idx += (gridDim.x * block_size) / cg_size; } } @@ -777,65 +786,60 @@ void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) auto num = map.pair_count_outer(pair_begin, pair_begin + num_pairs, pair_equal{}); REQUIRE(num == gold_size); - thrust::device_vector> probe_pairs(gold_size); - thrust::device_vector> contained_pairs(gold_size); + thrust::device_vector probe_keys(gold_size); + thrust::device_vector probe_vals(gold_size); + thrust::device_vector contained_keys(gold_size); + thrust::device_vector contained_vals(gold_size); custom_pair_retrieve_outer <<>>(pair_begin, pair_begin + num_pairs, - probe_pairs.begin(), - contained_pairs.begin(), + probe_keys.begin(), + probe_vals.begin(), + contained_keys.begin(), + contained_vals.begin(), d_scan.begin(), view, pair_equal{}); // sort before compare - thrust::sort(thrust::device, - probe_pairs.begin(), - probe_pairs.end(), - [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { - if (lhs.first == rhs.first) { return lhs.second < rhs.second; } - return lhs.first < rhs.first; - }); - thrust::sort(thrust::device, - contained_pairs.begin(), - contained_pairs.end(), - [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { - if (lhs.first == rhs.first) { return lhs.second < rhs.second; } - return lhs.first < rhs.first; - }); + thrust::sort(thrust::device, probe_keys.begin(), probe_keys.end()); + thrust::sort(thrust::device, probe_vals.begin(), probe_vals.end()); + thrust::sort(thrust::device, contained_keys.begin(), contained_keys.end()); + thrust::sort(thrust::device, contained_vals.begin(), contained_vals.end()); // set gold references - auto gold_probe_pairs = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [num_pairs] 
__device__(auto i) { - if (i < num_pairs) { return cuco::pair{i / 2, i / 2}; } - auto val = i - (num_pairs / 2); - return cuco::pair{val, val}; - }); - auto gold_contained_pairs = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [num_pairs] __device__(auto i) { - if (i < num_pairs / 2) { return cuco::pair{-1, -1}; } - auto val = i - (num_pairs / 2); - return cuco::pair{val / 2, val}; - }); + auto gold_probe = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [num_pairs] __device__(auto i) { + if (i < num_pairs) { return i / 2; } + return i - (int(num_pairs) / 2); + }); + auto gold_contained_key = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [num_pairs] __device__(auto i) { + if (i < num_pairs / 2) { return -1; } + return (i - (int(num_pairs) / 2)) / 2; + }); + auto gold_contained_val = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [num_pairs] __device__(auto i) { + if (i < num_pairs / 2) { return -1; } + return i - (int(num_pairs) / 2); + }); REQUIRE( - thrust::equal(thrust::device, - probe_pairs.begin(), - probe_pairs.begin() + gold_size, - gold_probe_pairs, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + thrust::equal(thrust::device, probe_keys.begin(), probe_keys.begin() + gold_size, gold_probe)); REQUIRE( - thrust::equal(thrust::device, - contained_pairs.begin(), - contained_pairs.begin() + gold_size, - gold_contained_pairs, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + thrust::equal(thrust::device, probe_vals.begin(), probe_vals.begin() + gold_size, gold_probe)); + + REQUIRE(thrust::equal(thrust::device, + contained_keys.begin(), + contained_keys.begin() + gold_size, + gold_contained_key)); + + REQUIRE(thrust::equal(thrust::device, + contained_vals.begin(), + contained_vals.begin() + gold_size, + 
gold_contained_val)); } TEMPLATE_TEST_CASE_SIG("Tests of non-shared-memory pair_retrieve", From 1c836c721609595a93f6de5b468153d7aebdeac6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 10 Dec 2021 20:00:25 -0500 Subject: [PATCH 65/70] Use const variable whenever possible in retrieve --- .../static_multimap/device_view_impl.inl | 46 ++++++++++--------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index ca41f42a1..c2a08f08e 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -915,8 +915,8 @@ class static_multimap::device_view_ if (first_exists or second_exists) { if constexpr (is_outer) { found_match = true; } - auto num_first_matches = __popc(first_exists); - auto num_second_matches = __popc(second_exists); + auto const num_first_matches = __popc(first_exists); + auto const num_second_matches = __popc(second_exists); uint32_t output_idx; if (0 == cg_lane_id) { @@ -925,14 +925,15 @@ class static_multimap::device_view_ output_idx = probing_cg.shfl(output_idx, 0); if (first_equals) { - auto lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id); - Key key = k; + auto const lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id); + Key key = k; output_buffer[output_idx + lane_offset] = cuco::make_pair(std::move(key), std::move(arr[0].second)); } if (second_equals) { - auto lane_offset = detail::count_least_significant_bits(second_exists, cg_lane_id); - Key key = k; + auto const lane_offset = + detail::count_least_significant_bits(second_exists, cg_lane_id); + Key key = k; output_buffer[output_idx + num_first_matches + lane_offset] = cuco::make_pair(std::move(key), std::move(arr[1].second)); } @@ -941,7 +942,7 @@ class static_multimap::device_view_ running = false; if constexpr (is_outer) { if ((not 
found_match) && (cg_lane_id == 0)) { - auto output_idx = atomicAdd(flushing_cg_counter, 1); + auto const output_idx = atomicAdd(flushing_cg_counter, 1); Key key = k; output_buffer[output_idx] = cuco::make_pair( std::move(key), std::move(this->get_empty_value_sentinel())); @@ -1016,17 +1017,18 @@ class static_multimap::device_view_ auto const slot_is_empty = detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); - auto const equals = (not slot_is_empty and key_equal(slot_contents.first, k)); + auto const equals = (not slot_is_empty and key_equal(slot_contents.first, k)); + auto const exists = g.ballot(equals); + uint32_t output_idx = *cg_counter; - auto const exists = g.ballot(equals); if (exists) { if constexpr (is_outer) { found_match = true; } - auto num_matches = __popc(exists); + auto const num_matches = __popc(exists); if (equals) { // Each match computes its lane-level offset - auto lane_offset = detail::count_least_significant_bits(exists, lane_id); - Key key = k; + auto const lane_offset = detail::count_least_significant_bits(exists, lane_id); + Key key = k; output_buffer[output_idx + lane_offset] = cuco::make_pair(std::move(key), std::move(slot_contents.second)); } @@ -1341,8 +1343,8 @@ class static_multimap::device_view_ if (first_exists or second_exists) { if constexpr (is_outer) { found_match = true; } - auto num_first_matches = __popc(first_exists); - auto num_second_matches = __popc(second_exists); + auto const num_first_matches = __popc(first_exists); + auto const num_second_matches = __popc(second_exists); uint32_t output_idx; if (0 == cg_lane_id) { @@ -1351,12 +1353,13 @@ class static_multimap::device_view_ output_idx = probing_cg.shfl(output_idx, 0); if (first_equals) { - auto lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id); + auto const lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id); probe_output_buffer[output_idx + lane_offset] = pair; 
contained_output_buffer[output_idx + lane_offset] = arr[0]; } if (second_equals) { - auto lane_offset = detail::count_least_significant_bits(second_exists, cg_lane_id); + auto const lane_offset = + detail::count_least_significant_bits(second_exists, cg_lane_id); probe_output_buffer[output_idx + num_first_matches + lane_offset] = pair; contained_output_buffer[output_idx + num_first_matches + lane_offset] = arr[1]; } @@ -1365,7 +1368,7 @@ class static_multimap::device_view_ running = false; if constexpr (is_outer) { if ((not found_match) && (cg_lane_id == 0)) { - auto output_idx = atomicAdd(flushing_cg_counter, 1); + auto const output_idx = atomicAdd(flushing_cg_counter, 1); probe_output_buffer[output_idx] = pair; contained_output_buffer[output_idx] = cuco::make_pair(std::move(this->get_empty_key_sentinel()), @@ -1456,16 +1459,17 @@ class static_multimap::device_view_ auto const slot_is_empty = detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); - auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair)); + auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair)); + auto const exists = g.ballot(equals); + uint32_t output_idx = *cg_counter; - auto const exists = g.ballot(equals); if (exists) { if constexpr (is_outer) { found_match = true; } - auto num_matches = __popc(exists); + auto const num_matches = __popc(exists); if (equals) { // Each match computes its lane-level offset - auto lane_offset = detail::count_least_significant_bits(exists, lane_id); + auto const lane_offset = detail::count_least_significant_bits(exists, lane_id); probe_output_buffer[output_idx + lane_offset] = pair; contained_output_buffer[output_idx + lane_offset] = slot_contents; } From b9ece83683fbb36e5a22ca62a2a88e8db299a4e1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 10 Dec 2021 18:30:36 -0800 Subject: [PATCH 66/70] Make sentinel getters part of public device_view API. 
--- .../static_multimap/device_view_impl.inl | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index c4e520446..3a6afc5e1 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -24,6 +24,27 @@ template class static_multimap::device_view_impl_base { + public: + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + __host__ __device__ __forceinline__ Key get_empty_key_sentinel() const noexcept + { + return empty_key_sentinel_; + } + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + __host__ __device__ __forceinline__ Value get_empty_value_sentinel() const noexcept + { + return empty_value_sentinel_; + } + protected: // Import member type definitions from `static_multimap` using value_type = value_type; @@ -150,26 +171,6 @@ class static_multimap::device_view_ return probe_sequence_.get_capacity(); } - /** - * @brief Gets the sentinel value used to represent an empty key slot. - * - * @return The sentinel value used to represent an empty key slot - */ - __host__ __device__ __forceinline__ Key get_empty_key_sentinel() const noexcept - { - return empty_key_sentinel_; - } - - /** - * @brief Gets the sentinel value used to represent an empty value slot. - * - * @return The sentinel value used to represent an empty value slot - */ - __host__ __device__ __forceinline__ Value get_empty_value_sentinel() const noexcept - { - return empty_value_sentinel_; - } - /** * @brief Load two key/value pairs from the given slot to the target pair array. 
* From cf1a33753f616826516e0cf7cf91ff336ee5ce1c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 14 Dec 2021 13:18:19 -0500 Subject: [PATCH 67/70] Remove leftovers --- include/cuco/detail/static_multimap/device_view_impl.inl | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index c2a08f08e..72641eee1 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1216,9 +1216,6 @@ class static_multimap::device_view_ OutputIt4 contained_val_begin, PairEqual pair_equal) noexcept { - using ProbePairType = typename thrust::iterator_traits::value_type; - using ContainedPairType = typename thrust::iterator_traits::value_type; - auto const lane_id = probing_cg.thread_rank(); auto current_slot = initial_slot(probing_cg, pair.first); [[maybe_unused]] auto found_match = false; From 809a5f4b614a805e5d9bd1f8f2e3183863311c89 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 14 Dec 2021 13:30:26 -0500 Subject: [PATCH 68/70] Update docs --- .../static_multimap/device_view_impl.inl | 36 +++++++++---------- include/cuco/static_multimap.cuh | 32 ++++++++--------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 72641eee1..a6154027a 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1059,16 +1059,16 @@ class static_multimap::device_view_ } /** - * @brief Retrieves all the matches of a given pair contained in multimap using vector - * loads without shared memory buffer. + * @brief Retrieves all the matches of a given pair using vector loads. 
* - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p.first` and `p.second` to - * unspecified locations started at `probe_key_begin` and `probe_val_begin`, and copies - * `slot[j].first` and `slot[j].second` to unspecified locations started at `contained_key_begin` - * and `contained_val_begin`. It's users responsibility to ensure these locations are valid and no - * other threads will attempt to write to overlapping locations. If `p` does not have any matches, - * copies `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only - * if `is_outer` is true. + * For pair `p` with `n = pair_count(cg, p, pair_equal)` matching pairs, if `pair_equal(p, + * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, + * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an + * unspecified value of `j` where `0 <= j < n`. Concurrent reads or writes to any of the output + * ranges results in undefined behavior. Behavior is undefined if the extent of any of the + * output ranges is less than `n`. If `p` does not have any matches, stores `probe_key_begin[0] + * = p.first`, `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, + * and `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true. * * @tparam is_outer Boolean flag indicating whether outer join is peformed * @tparam uses_vector_load Boolean flag indicating whether vector loads are used @@ -1168,16 +1168,16 @@ class static_multimap::device_view_ } /** - * @brief Retrieves all the matches of a given pair contained in multimap using scalar - * loads without shared memory buffer. + * @brief Retrieves all the matches of a given pair using scalar loads. 
* - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p.first` and `p.second` to - * unspecified locations started at `probe_key_begin` and `probe_val_begin`, and copies - * `slot[j].first` and `slot[j].second` to unspecified locations started at `contained_key_begin` - * and `contained_val_begin`. It's users responsibility to ensure these locations are valid and no - * other threads will attempt to write to overlapping locations. If `p` does not have any matches, - * copies `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only - * if `is_outer` is true. + * For pair `p` with `n = pair_count(cg, p, pair_equal)` matching pairs, if `pair_equal(p, + * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, + * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an + * unspecified value of `j` where `0 <= j < n`. Concurrent reads or writes to any of the output + * ranges results in undefined behavior. Behavior is undefined if the extent of any of the + * output ranges is less than `n`. If `p` does not have any matches, stores `probe_key_begin[0] + * = p.first`, `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, + * and `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true. 
* * @tparam is_outer Boolean flag indicating whether outer join is peformed * @tparam uses_vector_load Boolean flag indicating whether vector loads are used diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index e47a1a427..7e92056fb 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -960,14 +960,14 @@ class static_multimap { KeyEqual key_equal = KeyEqual{}) noexcept; /** - * @brief Retrieves all the matches of a given pair contained in multimap without using shared - * memory buffer + * @brief Retrieves all the matches of a given pair * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p.first` and `p.second` to - * unspecified locations started at `probe_key_begin` and `probe_val_begin`, and copies - * `slot[j].first` and `slot[j].second` to unspecified locations started at - * `contained_key_begin` and `contained_val_begin`. It's users responsibility to ensure these - * locations are valid and no other threads will attempt to write to overlapping locations. + * For pair `p` with `n = pair_count(cg, p, pair_equal)` matching pairs, if `pair_equal(p, + * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, + * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an + * unspecified value of `j` where `0 <= j < n`. Concurrent reads or writes to any of the output + * ranges results in undefined behavior. Behavior is undefined if the extent of any of the + * output ranges is less than `n`. * * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from * `pair`'s `Key` type. 
@@ -1047,16 +1047,16 @@ class static_multimap { PairEqual pair_equal) noexcept; /** - * @brief Retrieves all the matches of a given pair contained in multimap without using shared - * memory buffer + * @brief Retrieves all the matches of a given pair * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p.first` and `p.second` to - * unspecified locations started at `probe_key_begin` and `probe_val_begin`, and copies - * `slot[j].first` and `slot[j].second` to unspecified locations started at - * `contained_key_begin` and `contained_val_begin`. It's users responsibility to ensure these - * locations are valid and no other threads will attempt to write to overlapping locations. If - * `p` does not have any matches, copies `p` and a pair of `empty_key_sentinel` and - * `empty_value_sentinel` into the output. + * For pair `p` with `n = pair_count(cg, p, pair_equal)` matching pairs, if `pair_equal(p, + * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, + * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an + * unspecified value of `j` where `0 <= j < n`. Concurrent reads or writes to any of the output + * ranges results in undefined behavior. Behavior is undefined if the extent of any of the + * output ranges is less than `n`. If `p` does not have any matches, stores `probe_key_begin[0] + * = p.first`, `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, + * and `contained_val_begin[0] = empty_value_sentinel`. * * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from * `pair`'s `Key` type. 
From 327435e4a529a803016d4498db641cdd33bd676b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 14 Dec 2021 17:10:59 -0500 Subject: [PATCH 69/70] Update docs --- .../static_multimap/device_view_impl.inl | 36 ++++++++++--------- include/cuco/static_multimap.cuh | 22 +++++++----- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index a6154027a..04771d3d8 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1061,14 +1061,16 @@ class static_multimap::device_view_ /** * @brief Retrieves all the matches of a given pair using vector loads. * - * For pair `p` with `n = pair_count(cg, p, pair_equal)` matching pairs, if `pair_equal(p, - * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, - * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an - * unspecified value of `j` where `0 <= j < n`. Concurrent reads or writes to any of the output - * ranges results in undefined behavior. Behavior is undefined if the extent of any of the - * output ranges is less than `n`. If `p` does not have any matches, stores `probe_key_begin[0] - * = p.first`, `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, - * and `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true. + * For pair `p` with `n` matching pairs, if `pair_equal(p, slot)` returns true, stores + * `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, `contained_key_begin[j] = + * slot.first`, and `contained_val_begin[j] = slot.second` for an unspecified value of `j` where + * `0 <= j < n`. 
If `p` does not have any matches, stores `probe_key_begin[0] = p.first`, + * `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, and + * `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true. + * + * Concurrent reads or writes to any of the output ranges results in undefined behavior. + * + * Behavior is undefined if the extent of any of the output ranges is less than `n`. * * @tparam is_outer Boolean flag indicating whether outer join is peformed * @tparam uses_vector_load Boolean flag indicating whether vector loads are used @@ -1170,14 +1172,16 @@ class static_multimap::device_view_ /** * @brief Retrieves all the matches of a given pair using scalar loads. * - * For pair `p` with `n = pair_count(cg, p, pair_equal)` matching pairs, if `pair_equal(p, - * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, - * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an - * unspecified value of `j` where `0 <= j < n`. Concurrent reads or writes to any of the output - * ranges results in undefined behavior. Behavior is undefined if the extent of any of the - * output ranges is less than `n`. If `p` does not have any matches, stores `probe_key_begin[0] - * = p.first`, `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, - * and `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true. + * For pair `p` with `n` matching pairs, if `pair_equal(p, slot)` returns true, stores + * `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, `contained_key_begin[j] = + * slot.first`, and `contained_val_begin[j] = slot.second` for an unspecified value of `j` where + * `0 <= j < n`. 
If `p` does not have any matches, stores `probe_key_begin[0] = p.first`, + * `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, and + * `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true. + * + * Concurrent reads or writes to any of the output ranges results in undefined behavior. + * + * Behavior is undefined if the extent of any of the output ranges is less than `n`. * * @tparam is_outer Boolean flag indicating whether outer join is peformed * @tparam uses_vector_load Boolean flag indicating whether vector loads are used diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 7e92056fb..f820b0a05 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -965,9 +965,11 @@ class static_multimap { * For pair `p` with `n = pair_count(cg, p, pair_equal)` matching pairs, if `pair_equal(p, * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an - * unspecified value of `j` where `0 <= j < n`. Concurrent reads or writes to any of the output - * ranges results in undefined behavior. Behavior is undefined if the extent of any of the - * output ranges is less than `n`. + * unspecified value of `j` where `0 <= j < n`. + * + * Concurrent reads or writes to any of the output ranges results in undefined behavior. + * + * Behavior is undefined if the extent of any of the output ranges is less than `n`. * * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from * `pair`'s `Key` type. 
@@ -1049,14 +1051,16 @@ class static_multimap { /** * @brief Retrieves all the matches of a given pair * - * For pair `p` with `n = pair_count(cg, p, pair_equal)` matching pairs, if `pair_equal(p, + * For pair `p` with `n = pair_count_outer(cg, p, pair_equal)` matching pairs, if `pair_equal(p, * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an - * unspecified value of `j` where `0 <= j < n`. Concurrent reads or writes to any of the output - * ranges results in undefined behavior. Behavior is undefined if the extent of any of the - * output ranges is less than `n`. If `p` does not have any matches, stores `probe_key_begin[0] - * = p.first`, `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, - * and `contained_val_begin[0] = empty_value_sentinel`. + * unspecified value of `j` where `0 <= j < n`. If `p` does not have any matches, stores + * `probe_key_begin[0] = p.first`, `probe_val_begin[0] = p.second`, `contained_key_begin[0] = + * empty_key_sentinel`, and `contained_val_begin[0] = empty_value_sentinel`. + * + * Concurrent reads or writes to any of the output ranges results in undefined behavior. + * + * Behavior is undefined if the extent of any of the output ranges is less than `n`. * * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from * `pair`'s `Key` type. 
From 8187626065c5df9ab3072c2e566c3ae0ab992643 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 14 Dec 2021 13:01:02 -0500 Subject: [PATCH 70/70] Fix a bug: expose public device APIs --- include/cuco/detail/probe_sequence_impl.cuh | 2 + .../static_multimap/device_view_impl.inl | 102 ++++++------ include/cuco/static_multimap.cuh | 155 ++++++++++-------- 3 files changed, 143 insertions(+), 116 deletions(-) diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh index 3145e8fa5..549d59df3 100644 --- a/include/cuco/detail/probe_sequence_impl.cuh +++ b/include/cuco/detail/probe_sequence_impl.cuh @@ -108,6 +108,7 @@ class probe_sequence_impl_base { { } + public: /** * @brief Returns the capacity of the hash map. */ @@ -126,6 +127,7 @@ class probe_sequence_impl_base { */ __device__ __forceinline__ const_iterator get_slots() const noexcept { return slots_; } + protected: iterator slots_; ///< Pointer to beginning of the hash map slots const std::size_t capacity_; ///< Total number of slots }; // class probe_sequence_impl_base diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 3a6afc5e1..5737b7af8 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -24,27 +24,6 @@ template class static_multimap::device_view_impl_base { - public: - /** - * @brief Gets the sentinel value used to represent an empty key slot. - * - * @return The sentinel value used to represent an empty key slot - */ - __host__ __device__ __forceinline__ Key get_empty_key_sentinel() const noexcept - { - return empty_key_sentinel_; - } - - /** - * @brief Gets the sentinel value used to represent an empty value slot. 
- * - * @return The sentinel value used to represent an empty value slot - */ - __host__ __device__ __forceinline__ Value get_empty_value_sentinel() const noexcept - { - return empty_value_sentinel_; - } - protected: // Import member type definitions from `static_multimap` using value_type = value_type; @@ -81,26 +60,6 @@ class static_multimap::device_view_ { } - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ __forceinline__ pair_atomic_type* get_slots() noexcept - { - return probe_sequence_.get_slots(); - } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ __forceinline__ pair_atomic_type const* get_slots() const noexcept - { - return probe_sequence_.get_slots(); - } - /** * @brief Returns the initial slot for a given key `k` * @@ -161,16 +120,6 @@ class static_multimap::device_view_ return probe_sequence_.next_slot(s); } - /** - * @brief Gets the maximum number of elements the hash map can hold. - * - * @return The maximum number of elements the hash map can hold - */ - __host__ __device__ __forceinline__ std::size_t get_capacity() const noexcept - { - return probe_sequence_.get_capacity(); - } - /** * @brief Load two key/value pairs from the given slot to the target pair array. * @@ -189,6 +138,57 @@ class static_multimap::device_view_ } } + public: + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + __host__ __device__ __forceinline__ Key get_empty_key_sentinel() const noexcept + { + return empty_key_sentinel_; + } + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + __host__ __device__ __forceinline__ Value get_empty_value_sentinel() const noexcept + { + return empty_value_sentinel_; + } + + /** + * @brief Gets slots array. 
+ * + * @return Slots array + */ + __device__ __forceinline__ pair_atomic_type* get_slots() noexcept + { + return probe_sequence_.get_slots(); + } + + /** + * @brief Gets slots array. + * + * @return Slots array + */ + __device__ __forceinline__ pair_atomic_type const* get_slots() const noexcept + { + return probe_sequence_.get_slots(); + } + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + __host__ __device__ __forceinline__ std::size_t get_capacity() const noexcept + { + return probe_sequence_.get_capacity(); + } + private: probe_sequence_type probe_sequence_; ///< Probe sequence used to probe the hash map Key empty_key_sentinel_{}; ///< Key value that represents an empty slot diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 16b3ecd21..ab9135144 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -559,6 +559,78 @@ class static_multimap { class device_mutable_view_impl; class device_view_impl; + template + class device_view_base { + protected: + // Import member type definitions from `static_multimap` + using value_type = value_type; + using key_type = Key; + using mapped_type = Value; + using pair_atomic_type = pair_atomic_type; + using iterator = pair_atomic_type*; + using const_iterator = pair_atomic_type const*; + using probe_sequence_type = probe_sequence_type; + + __host__ __device__ device_view_base(pair_atomic_type* slots, + std::size_t capacity, + Key empty_key_sentinel, + Value empty_value_sentinel) noexcept + : impl_{slots, capacity, empty_key_sentinel, empty_value_sentinel} + { + } + + public: + /** + * @brief Gets slots array. + * + * @return Slots array + */ + __device__ __forceinline__ pair_atomic_type* get_slots() noexcept { return impl_.get_slots(); } + + /** + * @brief Gets slots array. 
+ * + * @return Slots array + */ + __device__ __forceinline__ pair_atomic_type const* get_slots() const noexcept + { + return impl_.get_slots(); + } + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + __host__ __device__ __forceinline__ std::size_t get_capacity() const noexcept + { + return impl_.get_capacity(); + } + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + __host__ __device__ __forceinline__ Key get_empty_key_sentinel() const noexcept + { + return impl_.get_empty_key_sentinel(); + } + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + __host__ __device__ __forceinline__ Value get_empty_value_sentinel() const noexcept + { + return impl_.get_empty_value_sentinel(); + } + + protected: + ViewImpl impl_; + }; // class device_view_base + public: /** * @brief Mutable, non-owning view-type that may be used in device code to @@ -580,14 +652,14 @@ class static_multimap { * }); * \endcode */ - class device_mutable_view { + class device_mutable_view : public device_view_base { public: - // Import member type definitions from `static_multimap` - using value_type = value_type; - using key_type = Key; - using mapped_type = Value; - using iterator = pair_atomic_type*; - using const_iterator = pair_atomic_type const*; + using view_base_type = device_view_base; + using value_type = typename view_base_type::value_type; + using key_type = typename view_base_type::key_type; + using mapped_type = typename view_base_type::mapped_type; + using iterator = typename view_base_type::iterator; + using const_iterator = typename view_base_type::const_iterator; /** * @brief Construct a mutable view of the first `capacity` slots of the @@ -604,7 +676,7 @@ class 
static_multimap { std::size_t capacity, Key empty_key_sentinel, Value empty_value_sentinel) noexcept - : impl_{slots, capacity, empty_key_sentinel, empty_value_sentinel} + : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -620,7 +692,7 @@ class static_multimap { value_type const& insert_pair) noexcept; private: - device_mutable_view_impl impl_; + using device_view_base<device_mutable_view_impl>::impl_; }; // class device mutable view /** @@ -631,14 +703,14 @@ class static_multimap { * value. * */ - class device_view { + class device_view : public device_view_base<device_view_impl> { public: - // Import member type definitions from `static_multimap` - using value_type = value_type; - using key_type = Key; - using mapped_type = Value; - using iterator = pair_atomic_type*; - using const_iterator = pair_atomic_type const*; + using view_base_type = device_view_base<device_view_impl>; + using value_type = typename view_base_type::value_type; + using key_type = typename view_base_type::key_type; + using mapped_type = typename view_base_type::mapped_type; + using iterator = typename view_base_type::iterator; + using const_iterator = typename view_base_type::const_iterator; /** * @brief Construct a view of the first `capacity` slots of the @@ -655,55 +727,8 @@ class static_multimap { std::size_t capacity, Key empty_key_sentinel, Value empty_value_sentinel) noexcept - : impl_{slots, capacity, empty_key_sentinel, empty_value_sentinel} - { - } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ __forceinline__ pair_atomic_type* get_slots() noexcept { return impl_.get_slots(); } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ __forceinline__ pair_atomic_type const* get_slots() const noexcept - { - return impl_.get_slots(); - } - - /** - * @brief Gets the maximum number of elements the hash map can hold.
- * - * @return The maximum number of elements the hash map can hold - */ - __host__ __device__ __forceinline__ std::size_t get_capacity() const noexcept - { - return impl_.get_capacity(); - } - - /** - * @brief Gets the sentinel value used to represent an empty key slot. - * - * @return The sentinel value used to represent an empty key slot - */ - __host__ __device__ __forceinline__ Key get_empty_key_sentinel() const noexcept + : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel} { - return impl_.get_empty_key_sentinel(); - } - - /** - * @brief Gets the sentinel value used to represent an empty value slot. - * - * @return The sentinel value used to represent an empty value slot - */ - __host__ __device__ __forceinline__ Value get_empty_value_sentinel() const noexcept - { - return impl_.get_empty_value_sentinel(); } /** @@ -1053,7 +1078,7 @@ class static_multimap { PairEqual pair_equal) noexcept; private: - device_view_impl impl_; + using device_view_base<device_view_impl>::impl_; }; // class device_view /**