diff --git a/README.md b/README.md index a11f5afdb..bbc6fc18b 100644 --- a/README.md +++ b/README.md @@ -254,4 +254,4 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::bloom_filter` implements a Blocked Bloom Filter for approximate set membership queries. #### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVm1v20YM_iuE9mF2I78FCwo4L4CXpJuxwsnstEUxD8HpRFuHynfavdgxgvz38U6SLSduMSwBEptHPnz4kEfpOTJojFDSRMO_niORRsNBHOVMLh1bYjSMuEtZFEdGOc399967uYR3cK2KrRbLzEKLt-G0f_pLDJPP45vxCK7vpvd309HD-G7S9b7B_6PgKA2m4GSKGmyGMCoYp3_VSQyfUXsicNrtQ8s7zKPqbB61zwPKVjlYsS1IZcEZJBhhYCFyBHziWFgQErhaFblgkiNshM1Cqgon0IGvFYhKLCN_RhEFfVs0PYHZHXX_k1lbDHu9zWbTZYF2V-llLy-dTe_j-Pp2MrvtEPVd2CeZk7Kg8R8nNBWebIEVxIyzhPjmbANKA1tqpDOrPPONFlbIZQxGLeyGaQw4qTBWi8TZA_FqnlR_04HkY5KEG81gPJtH8OtoNp7FAefL-OH3u08P8GU0nY4mD-PbGdxNqVmTm7FvFX37AKPJV_hjPLmJAUk6SoVPhfZVEFXhZcW01HCGeEBjoUpapkAuFoJDPUGwVGvUksqCAvVKlLNGJNOAk4uVsMwG25viQqreXM7lT0Ly3KUIF9xx1UtypVaP1HeLustddnXoYzPtjO1x5aTt-sM3RymuKcXjGrlV-rgLPiF3nthjoahp2-NehrqLNGrd1xyEoqYgWwWzkJYmTsjWWom0PZfPVBj0evAbStTMIgz6P_f7ffiG2yANDYNBbYMWC6GNhbNwTjiqMobSPYzH5iSf9a0C6VaPAeWywjw_7mMLGurLvfs76HfPvucrD307FcK5Ly3UMSvYxl-kklZ57U6Jr3FJp7SZwNU3bzhsdu-C0l1Vcc8U8lKBlvIOhweNKp09h1ZNJuyFnXfdjZY_6ia4JMnbcYjookz950G7ysCc17J4DF5UXjPkvOlAgaH-ne_Jvv7aS-5gyoCDoxpgT2Ov3Ph1ozOWL-pdFAK8Y9VulqatmkVcZWr_ULBEqfzKe9IldrltlcRjWLDc4KF2RwNlM_Cg-Q2MqpQ_HeptYzp3O4E2yFooZ_JtNde0p3alUeCDX-ImUy5PoUwHYS1b7bBTKENLcY0Q7gkJ83A_vRw0VaFh9XvcvJEm3hdeN_a_0A1l2oxZoA0cnjR-BaIMrF5dPj_7wj9pwkxXJyYEFlolLKfVTMswZZYB7QPHrSOsuAFToeBTJhJhjb9EXtdXdX-4n171IcWCyvKrVJVMqBUJMSdVQqyoa2tU5bfJoWvGTIbGPz5Tv5J9vcfllDs5ZSWn_I6ci1yRXF5tT_bSmypjqx6vsIxbh8N2pEFNU3Vj_Ri029CrAMvxK2e3zLv4H3lfV9I0_SCvrCs2Ng3YFi4u6HHrh5KetvS5FiGYfd8q-2JvD7GUJK-wNNJUSPCr-oXetPz7C73R6P0LWSTXnA9Oz9yAjlVhy7e1qENAl_zkZPAeOkzz7NKsHt_3odOhzW3pj6UcmHZytkrCK1wukgYm5zwn47p86SID1Su_RS9xfU67-uCctIte_g6__wINDYAL)) \ No newline at end of file +- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewRtU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yTQbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw)) \ No newline at end of file diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu index 00e2de775..8e4d0b762 100644 --- a/benchmarks/bloom_filter/add_bench.cu +++ b/benchmarks/bloom_filter/add_bench.cu @@ -41,9 +41,9 @@ template , Dist>) { - using policy_type = cuco::bloom_filter_policy, - Word, - static_cast(WordsPerBlock)>; + using policy_type = cuco::default_filter_policy, + Word, + static_cast(WordsPerBlock)>; using filter_type = cuco::bloom_filter, cuda::thread_scope_device, policy_type>; @@ -83,6 +83,51 @@ void bloom_filter_add(nvbench::state& state, }); } +/** + * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with + * `arrow_filter_policy` + */ +template +void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list) +{ + using policy_type = cuco::arrow_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if (num_sub_filters > policy_type::max_filter_blocks) { + state.skip("bloom filter with arrow policy should have <= 4194304 blocks"); // skip invalid + // configurations + } + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + filter_type filter{num_sub_filters}; + + state.collect_dram_throughput(); + state.collect_l1_hit_rates(); + state.collect_l2_hit_rates(); + state.collect_loads_efficiency(); + state.collect_stores_efficiency(); + + add_fpr_summary(state, filter); + + state.exec([&](nvbench::launch& launch) { + filter.add_async(keys.begin(), keys.end(), {launch.get_stream()}); + }); +} + NVBENCH_BENCH_TYPES(bloom_filter_add, NVBENCH_TYPE_AXES(nvbench::type_list, nvbench::type_list, @@ -118,3 +163,12 @@ NVBENCH_BENCH_TYPES(bloom_filter_add, .set_max_noise(defaults::MAX_NOISE) .add_int64_axis("NumInputs", {defaults::BF_N}) .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); + +NVBENCH_BENCH_TYPES(arrow_bloom_filter_add, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list)) + .set_name("arrow_bloom_filter_add_unique_size") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", {defaults::BF_N}) + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); \ No newline at end of file diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu index 67ba80d95..bf81f5d83 100644 --- a/benchmarks/bloom_filter/contains_bench.cu +++ b/benchmarks/bloom_filter/contains_bench.cu @@ -43,9 +43,9 @@ void bloom_filter_contains( { // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if // filter block fits into a 32B sector - using policy_type = cuco::bloom_filter_policy, - Word, - static_cast(WordsPerBlock)>; + using policy_type = cuco::default_filter_policy, + Word, + static_cast(WordsPerBlock)>; using filter_type = cuco::bloom_filter, cuda::thread_scope_device, policy_type>; @@ -88,6 +88,56 @@ void bloom_filter_contains( }); } +/** + * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with + * `arrow_filter_policy` + */ +template +void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list) +{ + // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if + // filter block fits into a 32B sector + using policy_type = cuco::arrow_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if (num_sub_filters > policy_type::max_filter_blocks) { + state.skip("bloom filter with arrow policy should have <= 4194304 blocks"); // skip invalid + // configurations + } + + thrust::device_vector keys(num_keys); + thrust::device_vector result(num_keys, false); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + filter_type filter{num_sub_filters}; + + state.collect_dram_throughput(); + state.collect_l1_hit_rates(); + state.collect_l2_hit_rates(); + state.collect_loads_efficiency(); + state.collect_stores_efficiency(); + + add_fpr_summary(state, filter); + + filter.add(keys.begin(), keys.end()); + + state.exec([&](nvbench::launch& launch) { + filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + }); +} + NVBENCH_BENCH_TYPES(bloom_filter_contains, NVBENCH_TYPE_AXES(nvbench::type_list, nvbench::type_list, @@ -122,4 +172,13 @@ NVBENCH_BENCH_TYPES(bloom_filter_contains, .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); \ No newline at end of file + .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); + +NVBENCH_BENCH_TYPES(arrow_bloom_filter_contains, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list)) + .set_name("arrow_bloom_filter_contains_unique_size") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", {defaults::BF_N}) + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); \ No newline at end of file diff --git a/examples/bloom_filter/host_bulk_example.cu b/examples/bloom_filter/host_bulk_example.cu index 14a2a5cfa..f02f6e657 100644 --- a/examples/bloom_filter/host_bulk_example.cu +++ b/examples/bloom_filter/host_bulk_example.cu @@ -25,15 +25,21 @@ int main(void) { - // Generate 10'000 keys and insert the first 5'000 into the filter. - int constexpr num_keys = 10'000; - int constexpr num_tp = num_keys * 0.5; - int constexpr num_tn = num_keys - num_tp; + int constexpr num_keys = 10'000; ///< Generate 10'000 keys + int constexpr num_tp = num_keys * 0.5; ///< Insert the first half keys into the filter. + int constexpr num_tn = num_keys - num_tp; + int constexpr sub_filters = 200; ///< 200 sub-filters per bloom filter - // Spawn a filter with 200 sub-filters. - cuco::bloom_filter filter{200}; + // key type for bloom filter + using key_type = int; - thrust::device_vector keys(num_keys); + // Spawn a bloom filter with default policy and 200 sub-filters. + cuco::bloom_filter filter{sub_filters}; + + std::cout << "Bulk insert into bloom filter with default fingerprint generation policy: " + << std::endl; + + thrust::device_vector keys(num_keys); thrust::sequence(keys.begin(), keys.end(), 1); auto tp_begin = keys.begin(); diff --git a/include/cuco/bloom_filter.cuh b/include/cuco/bloom_filter.cuh index 16642ab57..b05e9469e 100644 --- a/include/cuco/bloom_filter.cuh +++ b/include/cuco/bloom_filter.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include #include @@ -55,13 +55,13 @@ namespace cuco { * @tparam Extent Size type that is used to determine the number of blocks in the filter * @tparam Scope The scope in which operations will be performed by individual threads * @tparam Policy Type that defines how to generate and store key fingerprints (see - * `cuco/bloom_filter_policy.cuh`) + * `cuco/bloom_filter_policies.cuh`) * @tparam Allocator Type of allocator used for device-accessible storage */ template , cuda::thread_scope Scope = cuda::thread_scope_device, - class Policy = cuco::bloom_filter_policy, std::uint32_t, 8>, + class Policy = cuco::default_filter_policy, std::uint32_t, 8>, class Allocator = cuco::cuda_allocator> class bloom_filter { public: @@ -109,7 +109,7 @@ class bloom_filter { * * @param num_blocks Number of sub-filters or blocks * @param scope The scope in which operations will be performed - * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policy.cuh`) + * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`) * @param alloc Allocator used for allocating device-accessible storage * @param stream CUDA stream used to initialize the filter */ diff --git a/include/cuco/bloom_filter_policy.cuh b/include/cuco/bloom_filter_policies.cuh similarity index 81% rename from include/cuco/bloom_filter_policy.cuh rename to include/cuco/bloom_filter_policies.cuh index 5b8f87d95..cf9ddb371 100644 --- a/include/cuco/bloom_filter_policy.cuh +++ b/include/cuco/bloom_filter_policies.cuh @@ -16,14 +16,25 @@ #pragma once -#include +#include +#include #include namespace cuco { /** - * @brief A policy that defines how a Blocked Bloom Filter generates and stores a key's fingerprint. + * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's + * fingerprint. + * + * @tparam Key The type of the values to generate a fingerprint for. + */ +template +using arrow_filter_policy = detail::arrow_filter_policy; + +/** + * @brief The default policy that defines how a Blocked Bloom Filter generates and stores a key's + * fingerprint. * * @note `Word` type must be an atomically updatable integral type. `WordsPerBlock` must * be a power-of-two. @@ -33,8 +44,8 @@ namespace cuco { * @tparam WordsPerBlock Number of words/segments in each block */ template -class bloom_filter_policy { - using impl_type = cuco::detail::bloom_filter_policy_impl; +class default_filter_policy { + using impl_type = cuco::detail::default_filter_policy_impl; public: using hasher = typename impl_type::hasher; ///< Type of the hash function @@ -48,7 +59,7 @@ class bloom_filter_policy { public: /** - * @brief Constructs the `bloom_filter_policy` object. + * @brief Constructs the `default_filter_policy` object. * * @throws Compile-time error if the specified number of words in a filter block is not a * power-of-two or is larger than 32. If called from host: throws exception; If called from @@ -64,8 +75,8 @@ class bloom_filter_policy { * @param pattern_bits Number of bits in a key's fingerprint * @param hash Hash function used to generate a key's fingerprint */ - __host__ __device__ constexpr bloom_filter_policy(std::uint32_t pattern_bits = words_per_block, - Hash hash = {}); + __host__ __device__ constexpr default_filter_policy(std::uint32_t pattern_bits = words_per_block, + Hash hash = {}); /** * @brief Generates the hash value for a given key. @@ -116,4 +127,4 @@ class bloom_filter_policy { } // namespace cuco -#include \ No newline at end of file +#include \ No newline at end of file diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh index e4434845f..98baaae75 100644 --- a/include/cuco/bloom_filter_ref.cuh +++ b/include/cuco/bloom_filter_ref.cuh @@ -33,7 +33,7 @@ namespace cuco { * @tparam Extent Size type that is used to determine the number of blocks in the filter * @tparam Scope The scope in which operations will be performed by individual threads * @tparam Policy Type that defines how to generate and store key fingerprints (see - * `cuco/bloom_filter_policy.cuh`) + * `cuco/bloom_filter_policies.cuh`) */ template class bloom_filter_ref { @@ -60,7 +60,7 @@ class bloom_filter_ref { * @param data Pointer to the storage span of the filter * @param num_blocks Number of sub-filters or blocks * @param scope The scope in which operations will be performed - * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policy.cuh`) + * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`) */ __host__ __device__ explicit constexpr bloom_filter_ref(word_type* data, Extent num_blocks, diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh new file mode 100644 index 000000000..23b95793e --- /dev/null +++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include +#include + +namespace cuco::detail { + +/** + * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's + * fingerprint. + * + * Reference: + * https://github.com/apache/arrow/blob/be1dcdb96b030639c0b56955c4c62f9d6b03f473/cpp/src/parquet/bloom_filter.cc#L219-L230 + * + * Example: + * @code{.cpp} + * template + * void bulk_insert_and_eval_arrow_policy_bloom_filter(device_vector const& positive_keys, + * device_vector const& negative_keys) + * { + * using policy_type = cuco::arrow_filter_policy; + * + * // Warn or throw if the number of filter blocks is greater than maximum used by Arrow policy. + * static_assert(NUM_FILTER_BLOCKS <= policy_type::max_filter_blocks, "NUM_FILTER_BLOCKS must be + * in range: [1, 4194304]"); + * + * // Create a bloom filter with Arrow policy + * cuco::bloom_filter, + * cuda::thread_scope_device, policy_type> filter{NUM_FILTER_BLOCKS}; + * + * // Add positive keys to the bloom filter + * filter.add(positive_keys.begin(), positive_keys.end()); + * + * auto const num_tp = positive_keys.size(); + * auto const num_tn = negative_keys.size(); + * + * // Vectors to store query results. + * thrust::device_vector true_positive_result(num_tp, false); + * thrust::device_vector true_negative_result(num_tn, false); + * + * // Query the bloom filter for the inserted keys. + * filter.contains(positive_keys.begin(), positive_keys.end(), true_positive_result.begin()); + * + * // We should see a true-positive rate of 1. + * float true_positive_rate = float(thrust::count(thrust::device, + * true_positive_result.begin(), true_positive_result.end(), true)) / float(num_tp); + * + * // Query the bloom filter for the non-inserted keys. + * filter.contains(negative_keys.begin(), negative_keys.end(), true_negative_result.begin()); + * + * // We may see a false-positive rate > 0 depending on the number of bits in the + * // filter and the number of hashes used per key. + * float false_positive_rate = float(thrust::count(thrust::device, + * true_negative_result.begin(), true_negative_result.end(), true)) / float(num_tn); + * } + * @endcode + * + * @tparam Key The type of the values to generate a fingerprint for. + */ +template +class arrow_filter_policy { + public: + using hasher = cuco::xxhash_64; ///< xxhash_64 hasher for Arrow bloom filter policy + using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy + using hash_argument_type = typename hasher::argument_type; ///< Hash function input type + using hash_result_type = decltype(std::declval()( + std::declval())); ///< hash function output type + + static constexpr uint32_t bits_set_per_block = 8; ///< hardcoded bits set per Arrow filter block + static constexpr uint32_t words_per_block = 8; ///< hardcoded words per Arrow filter block + + static constexpr std::uint32_t bytes_per_filter_block = + 32; ///< Number of bytes in one Arrow filter block + static constexpr std::uint32_t max_arrow_filter_bytes = + 128 * 1024 * 1024; ///< Max bytes in Arrow bloom filter + static constexpr std::uint32_t max_filter_blocks = + (max_arrow_filter_bytes / + bytes_per_filter_block); ///< Max sub-filter blocks allowed in Arrow bloom filter + + private: + // Arrow's block-based bloom filter algorithm needs these eight odd SALT values to calculate + // eight indexes of bit to set, one bit in each 32-bit (uint32_t) word. + __device__ static constexpr cuda::std::array SALT() + { + return {0x47b6137bU, + 0x44974d91U, + 0x8824ad5bU, + 0xa2b7289dU, + 0x705495c7U, + 0x2df1424bU, + 0x9efc4947U, + 0x5c6bfb31U}; + } + + public: + /** + * @brief Constructs the `arrow_filter_policy` object. + * + * @note The number of filter blocks with Arrow policy must be in the + * range of [1, 4194304]. If the bloom filter is constructed with a larger + * number of blocks, only the first 4194304 (128MB) blocks will be used. + * + * @param hash Hash function used to generate a key's fingerprint + */ + __host__ __device__ constexpr arrow_filter_policy(hasher hash = {}) : hash_{hash} {} + + /** + * @brief Generates the hash value for a given key. + * + * @param key The key to hash + * + * @return The hash value of the key + */ + __device__ constexpr hash_result_type hash(hash_argument_type const& key) const + { + return hash_(key); + } + + /** + * @brief Determines the filter block a key is added into. + * + * @note The number of filter blocks with Arrow policy must be in the + * range of [1, 4194304]. Passing a larger `num_blocks` will still + * upperbound the number of blocks used to the mentioned range. + * + * @tparam Extent Size type that is used to determine the number of blocks in the filter + * + * @param hash Hash value of the key + * @param num_blocks Number of block in the filter + * + * @return The block index for the given key's hash value + */ + template + __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const + { + constexpr auto hash_bits = cuda::std::numeric_limits::digits; + // TODO: assert if num_blocks > max_filter_blocks + auto const max_blocks = cuda::std::min(num_blocks, max_filter_blocks); + // Make sure we are only contained withing the `max_filter_blocks` blocks + return static_cast(((hash >> hash_bits) * max_blocks) >> hash_bits) % max_blocks; + } + + /** + * @brief Determines the fingerprint pattern for a word/segment within the filter block for a + * given key's hash value. + * + * @param hash Hash value of the key + * @param word_index Target word/segment within the filter block + * + * @return The bit pattern for the word/segment in the filter block + */ + __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const + { + // SALT array to calculate bit indexes for the current word + auto constexpr salt = SALT(); + word_type const key = static_cast(hash); + return word_type{1} << ((key * salt[word_index]) >> 27); + } + + private: + hasher hash_; +}; + +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/bloom_filter_policy.inl b/include/cuco/detail/bloom_filter/default_filter_policy.inl similarity index 59% rename from include/cuco/detail/bloom_filter/bloom_filter_policy.inl rename to include/cuco/detail/bloom_filter/default_filter_policy.inl index f117798f9..eb8dbf703 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_policy.inl +++ b/include/cuco/detail/bloom_filter/default_filter_policy.inl @@ -21,33 +21,34 @@ namespace cuco { template -__host__ __device__ constexpr bloom_filter_policy::bloom_filter_policy( - uint32_t pattern_bits, Hash hash) +__host__ + __device__ constexpr default_filter_policy::default_filter_policy( + uint32_t pattern_bits, Hash hash) : impl_{pattern_bits, hash} { } template -__device__ constexpr typename bloom_filter_policy::hash_result_type -bloom_filter_policy::hash( - typename bloom_filter_policy::hash_argument_type const& key) const +__device__ constexpr typename default_filter_policy::hash_result_type +default_filter_policy::hash( + typename default_filter_policy::hash_argument_type const& key) const { return impl_.hash(key); } template template -__device__ constexpr auto bloom_filter_policy::block_index( - typename bloom_filter_policy::hash_result_type hash, +__device__ constexpr auto default_filter_policy::block_index( + typename default_filter_policy::hash_result_type hash, Extent num_blocks) const { return impl_.block_index(hash, num_blocks); } template -__device__ constexpr typename bloom_filter_policy::word_type -bloom_filter_policy::word_pattern( - bloom_filter_policy::hash_result_type hash, +__device__ constexpr typename default_filter_policy::word_type +default_filter_policy::word_pattern( + default_filter_policy::hash_result_type hash, std::uint32_t word_index) const { return impl_.word_pattern(hash, word_index); diff --git a/include/cuco/detail/bloom_filter/bloom_filter_policy_impl.cuh b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh similarity index 95% rename from include/cuco/detail/bloom_filter/bloom_filter_policy_impl.cuh rename to include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh index d7af7667c..ae2331b44 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_policy_impl.cuh +++ b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh @@ -29,7 +29,7 @@ namespace cuco::detail { template -class bloom_filter_policy_impl { +class default_filter_policy_impl { public: using hasher = Hash; using word_type = Word; @@ -43,7 +43,8 @@ class bloom_filter_policy_impl { static constexpr std::uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1); public: - __host__ __device__ explicit constexpr bloom_filter_policy_impl(uint32_t pattern_bits, Hash hash) + __host__ __device__ explicit constexpr default_filter_policy_impl(uint32_t pattern_bits, + Hash hash) : pattern_bits_{pattern_bits}, min_bits_per_word_{pattern_bits_ / words_per_block}, remainder_bits_{pattern_bits_ % words_per_block}, diff --git a/tests/bloom_filter/unique_sequence_test.cu b/tests/bloom_filter/unique_sequence_test.cu index 3919e77bf..7069f4c6a 100644 --- a/tests/bloom_filter/unique_sequence_test.cu +++ b/tests/bloom_filter/unique_sequence_test.cu @@ -57,7 +57,7 @@ void test_unique_sequence(Filter& filter, size_type num_keys) REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), thrust::identity{})); } - SECTION("After clearing the flter no keys should be contained.") + SECTION("After clearing the filter no keys should be contained.") { filter.clear(); filter.contains(keys.begin(), keys.end(), contained.begin()); @@ -84,13 +84,14 @@ void test_unique_sequence(Filter& filter, size_type num_keys) // TODO test FPR but how? } -TEMPLATE_TEST_CASE_SIG("Unique sequence", - "", - ((class Key, class Policy), Key, Policy), - (int32_t, cuco::bloom_filter_policy, uint32_t, 1>), - (int32_t, cuco::bloom_filter_policy, uint32_t, 8>), - (int32_t, cuco::bloom_filter_policy, uint64_t, 1>), - (int32_t, cuco::bloom_filter_policy, uint64_t, 8>)) +TEMPLATE_TEST_CASE_SIG( + "Unique sequence with default policy", + "", + ((class Key, class Policy), Key, Policy), + (int32_t, cuco::default_filter_policy, uint32_t, 1>), + (int32_t, cuco::default_filter_policy, uint32_t, 8>), + (int32_t, cuco::default_filter_policy, uint64_t, 1>), + (int32_t, cuco::default_filter_policy, uint64_t, 8>)) { using filter_type = cuco::bloom_filter, cuda::thread_scope_device, Policy>; @@ -103,3 +104,19 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence", test_unique_sequence(filter, num_keys); } + +TEMPLATE_TEST_CASE_SIG("Unique sequence with arrow policy", + "", + ((class Key, class Policy), Key, Policy), + (int32_t, cuco::arrow_filter_policy), + (uint64_t, cuco::arrow_filter_policy), + (float, cuco::arrow_filter_policy)) +{ + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, Policy>; + constexpr size_type num_keys{400}; + + auto filter = filter_type{1000}; + + test_unique_sequence(filter, num_keys); +}