Skip to content

Commit

Permalink
Merge pull request #2428 from eseiler/feature/ibf_clear
Browse files Browse the repository at this point in the history
[FEATURE] interleaved_bloom_filter::clear
  • Loading branch information
eseiler authored Mar 8, 2021
2 parents fddd897 + 5a54705 commit 696b6e3
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 4 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.
* The `seqan3::fm_index_cursor` exposes its suffix array interval ([\#2076](https://github.com/seqan/seqan3/pull/2076)).
* The `seqan3::interleaved_bloom_filter` supports counting occurrences of a range of values
([\#2373](https://github.com/seqan/seqan3/pull/2373)).
* The `seqan3::interleaved_bloom_filter` supports clearing of bins
([\#2428](https://github.com/seqan/seqan3/pull/2428)).

## Notable Bug-fixes

Expand Down
55 changes: 54 additions & 1 deletion include/seqan3/search/dream_index/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ class interleaved_bloom_filter
*
* \include test/snippet/search/dream_index/interleaved_bloom_filter_emplace.cpp
*/
void emplace(size_t const value, bin_index const bin)
void emplace(size_t const value, bin_index const bin) noexcept
//!\cond
requires (data_layout_mode == data_layout::uncompressed)
//!\endcond
Expand All @@ -293,6 +293,59 @@ class interleaved_bloom_filter
};
}

/*!\brief Clears a specific bin.
 * \param[in] bin The index of the bin to clear. Must be smaller than `bins` (checked by an assertion).
 *
 * \attention This function is only available for **uncompressed** Interleaved Bloom Filters.
 *
 * \details
 *
 * Starting at position `bin.get()`, every `technical_bins`-th entry of the underlying data is reset to `0`,
 * for a total of `bin_size_` entries.
 *
 * ### Example
 *
 * \include test/snippet/search/dream_index/interleaved_bloom_filter_clear.cpp
 */
void clear(bin_index const bin) noexcept
//!\cond
    requires (data_layout_mode == data_layout::uncompressed)
//!\endcond
{
    assert(bin.get() < bins);

    // Position of the entry belonging to `bin` within the current stride.
    size_t position = bin.get();

    for (size_t row = 0; row < bin_size_; ++row)
    {
        data[position] = 0;
        position += technical_bins; // Jump to the same bin in the next stride.
    }
}

/*!\brief Clears a range of bins.
 * \tparam rng_t The type of the range. Must model std::ranges::forward_range and the reference type must be
 *               seqan3::bin_index.
 * \param[in] bin_range The range of bins to clear. Each index must be smaller than `bins` (checked by an
 *                      assertion in debug builds).
 *
 * \attention This function is only available for **uncompressed** Interleaved Bloom Filters.
 *
 * \details
 *
 * The data is processed stride by stride (`bin_size_` strides, each `technical_bins` entries apart). Within a
 * stride, the entry of every bin in `bin_range` is reset to `0` before moving on to the next stride.
 *
 * ### Example
 *
 * \include test/snippet/search/dream_index/interleaved_bloom_filter_clear.cpp
 */
template <typename rng_t>
//!\cond
    requires (data_layout_mode == data_layout::uncompressed)
//!\endcond
void clear(rng_t && bin_range) noexcept
{
    static_assert(std::ranges::forward_range<rng_t>, "The range of bins to clear must model a forward_range.");
    static_assert(std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<rng_t>>, bin_index>,
                  "The reference type of the range to clear must be seqan3::bin_index.");

#ifndef NDEBUG
    // Validate all requested bins up front; the range is traversed again below, hence forward_range.
    for (auto && requested_bin : bin_range)
        assert(requested_bin.get() < bins);
#endif // NDEBUG

    // Index of the first entry of the current stride.
    size_t row_start = 0;

    for (size_t row = 0; row < bin_size_; ++row)
    {
        for (auto && requested_bin : bin_range)
            data[requested_bin.get() + row_start] = 0;

        row_start += technical_bins;
    }
}

/*!\brief Increases the number of bins stored in the Interleaved Bloom Filter.
* \param[in] new_bins_ The new number of bins.
* \throws std::invalid_argument If passed number of bins is smaller than current number of bins.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <benchmark/benchmark.h>

#include <seqan3/range/views/to.hpp>
#include <seqan3/range/views/zip.hpp>
#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>
#include <seqan3/test/performance/sequence_generator.hpp>
Expand All @@ -20,13 +21,19 @@ inline benchmark::Counter hashes_per_second(size_t const count)

static void arguments(benchmark::internal::Benchmark* b)
{
// Bins must be powers of two
for (int32_t bins : {64, 8192})
{
for (int32_t bits = 1<<15; bits <= 1<<20/* Increase for more extensive benchmarks*/; bits <<= 5)
// Size of the IBF will be 2^bits bits
for (int32_t bits = 15; bits <= 20; bits += 5)
{
for (int32_t hash_num = 2; hash_num < 3/* Increase for more extensive benchmarks*/; ++hash_num)
// The bits per bin must fit in an int32_t
if (bits - std::countr_zero(static_cast<uint32_t>(bins)) < 32)
{
b->Args({bins, bits/bins, hash_num, 1'000/* Increase for more extensive benchmarks*/});
for (int32_t hash_num = 2; hash_num < 3; ++hash_num)
{
b->Args({bins, (1LL << bits)/bins, hash_num, 1'000});
}
}
}
}
Expand Down Expand Up @@ -63,6 +70,52 @@ void emplace_benchmark(::benchmark::State & state)
state.counters["hashes/sec"] = hashes_per_second(std::ranges::size(hash_values));
}

// Benchmarks clearing every bin of the filter one at a time via the single-bin `clear` overload.
// `state.range(0)` is used as the number of bins to clear.
template <typename ibf_type>
void clear_benchmark(::benchmark::State & state)
{
    // Only the filter returned by set_up is needed here; the other two results are intentionally unused.
    auto && [ unused_bin_indices, unused_hash_values, ibf ] = set_up<ibf_type>(state.range(0),
                                                                               state.range(1),
                                                                               state.range(2),
                                                                               state.range(3));
    (void) unused_bin_indices;
    (void) unused_hash_values;

    // Materialise the indices [0, state.range(0)) once so that the timed loop only measures clearing.
    auto to_bin_index = [] (size_t i) { return seqan3::bin_index{i}; };
    std::vector<seqan3::bin_index> all_bins = std::views::iota(0u, static_cast<size_t>(state.range(0)))
                                            | std::views::transform(to_bin_index)
                                            | seqan3::views::to<std::vector>;

    for (auto _ : state)
    {
        for (auto current_bin : all_bins)
            ibf.clear(current_bin);
    }

    state.counters["bins/sec"] = hashes_per_second(std::ranges::size(all_bins));
}

// Benchmarks clearing every bin of the filter with a single call to the range-based `clear` overload.
// `state.range(0)` is used as the number of bins to clear.
template <typename ibf_type>
void clear_range_benchmark(::benchmark::State & state)
{
    // Only the filter returned by set_up is needed here; the other two results are intentionally unused.
    auto && [ unused_bin_indices, unused_hash_values, ibf ] = set_up<ibf_type>(state.range(0),
                                                                               state.range(1),
                                                                               state.range(2),
                                                                               state.range(3));
    (void) unused_bin_indices;
    (void) unused_hash_values;

    // Materialise the indices [0, state.range(0)) once so that the timed loop only measures clearing.
    auto to_bin_index = [] (size_t i) { return seqan3::bin_index{i}; };
    std::vector<seqan3::bin_index> all_bins = std::views::iota(0u, static_cast<size_t>(state.range(0)))
                                            | std::views::transform(to_bin_index)
                                            | seqan3::views::to<std::vector>;

    for (auto _ : state)
    {
        ibf.clear(all_bins);
    }

    state.counters["bins/sec"] = hashes_per_second(std::ranges::size(all_bins));
}

template <typename ibf_type>
void bulk_contains_benchmark(::benchmark::State & state)
{
Expand Down Expand Up @@ -102,6 +155,10 @@ void bulk_count_benchmark(::benchmark::State & state)

// Register the emplace and clear benchmarks for the uncompressed layout.
// Each one is run with the parameter sets supplied by `arguments`.
BENCHMARK_TEMPLATE(emplace_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>)->Apply(arguments);
// Clears all bins one by one using the single-bin overload.
BENCHMARK_TEMPLATE(clear_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>)->Apply(arguments);
// Clears all bins with one call to the range overload.
BENCHMARK_TEMPLATE(clear_range_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>)->Apply(arguments);

BENCHMARK_TEMPLATE(bulk_contains_benchmark,
seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>)->Apply(arguments);
Expand Down
52 changes: 52 additions & 0 deletions test/snippet/search/dream_index/interleaved_bloom_filter_clear.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include <seqan3/alphabet/nucleotide/dna4.hpp>
#include <seqan3/core/debug_stream.hpp>
#include <seqan3/range/views/kmer_hash.hpp>
#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>

using seqan3::operator""_dna4;

int main()
{
seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{8u},
seqan3::bin_size{8192u},
seqan3::hash_function_count{2u}};

auto const sequence1 = "ACTGACTGACTGATC"_dna4;
auto const sequence2 = "GTGACTGACTGACTCG"_dna4;
auto const sequence3 = "AAAAAAACGATCGACA"_dna4;
auto hash_adaptor = seqan3::views::kmer_hash(seqan3::ungapped{5u});

// Insert all 5-mers of sequence1 into bin 0
for (auto && value : sequence1 | hash_adaptor)
ibf.emplace(value, seqan3::bin_index{0u});

// Insert all 5-mers of sequence2 into bin 4
for (auto && value : sequence2 | hash_adaptor)
ibf.emplace(value, seqan3::bin_index{4u});

// Insert all 5-mers of sequence3 into bin 7
for (auto && value : sequence3 | hash_adaptor)
ibf.emplace(value, seqan3::bin_index{7u});

auto agent = ibf.counting_agent();


// Count all 5-mers of sequence1 for all bins
seqan3::debug_stream << agent.bulk_count(sequence1 | hash_adaptor) << '\n'; // [11,0,0,0,9,0,0,0]

// Clear bin 0
ibf.clear(seqan3::bin_index{0u});

// After clearing, no 5-mers are found in bin 0
seqan3::debug_stream << agent.bulk_count(sequence1 | hash_adaptor) << '\n'; // [0,0,0,0,9,0,0,0]


// Search for specific values
seqan3::debug_stream << agent.bulk_count(std::views::iota(0u, 1024u)) << '\n'; // [0,0,0,0,7,0,0,10]

// Clear bin 4 and 7
ibf.clear(std::vector{seqan3::bin_index{4u}, seqan3::bin_index{7u}});

// After clearing, nothing is found
seqan3::debug_stream << agent.bulk_count(std::views::iota(0u, 1024u)) << '\n'; // [0,0,0,0,0,0,0,0]
}
55 changes: 55 additions & 0 deletions test/unit/search/dream_index/interleaved_bloom_filter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,61 @@ TYPED_TEST(interleaved_bloom_filter_test, emplace)
}
}

// Checks that clearing a single bin removes exactly that bin's content and leaves all other bins untouched.
TYPED_TEST(interleaved_bloom_filter_test, clear)
{
    // Step 1: Fill an uncompressed interleaved_bloom_filter; the compressed one is not mutable.
    seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{64u},
                                         seqan3::bin_size{1024u},
                                         seqan3::hash_function_count{2u}};

    // Insert the values 0..63 into each of the 64 bins.
    for (size_t bin_idx = 0; bin_idx < 64; ++bin_idx)
        for (size_t hash = 0; hash < 64; ++hash)
            ibf.emplace(hash, seqan3::bin_index{bin_idx});

    // Step 2: Clear one bin.
    ibf.clear(seqan3::bin_index{17u});

    // Step 3: Build the IBF type under test (uncompressed or compressed) and verify via bulk_contains.
    TypeParam ibf2{ibf};
    auto agent = ibf2.membership_agent();

    // Every value is expected in every bin, with the sole exception of the cleared bin 17.
    sdsl::bit_vector expected(64, 1);
    expected[17] = 0;

    for (size_t hash = 0; hash < 64; ++hash)
        EXPECT_EQ(agent.bulk_contains(hash), expected);
}

// Checks that clearing a range of bins removes exactly those bins' content and leaves all other bins untouched.
TYPED_TEST(interleaved_bloom_filter_test, clear_range)
{
    // Step 1: Fill an uncompressed interleaved_bloom_filter; the compressed one is not mutable.
    seqan3::interleaved_bloom_filter ibf{seqan3::bin_count{64u},
                                         seqan3::bin_size{1024u},
                                         seqan3::hash_function_count{2u}};

    // Insert the values 0..63 into each of the 64 bins.
    for (size_t bin_idx = 0; bin_idx < 64; ++bin_idx)
        for (size_t hash = 0; hash < 64; ++hash)
            ibf.emplace(hash, seqan3::bin_index{bin_idx});

    // Step 2: Clear bins 8, 17 and 45 with one call.
    std::vector<seqan3::bin_index> bin_range{seqan3::bin_index{8u}, seqan3::bin_index{17u}, seqan3::bin_index{45u}};
    ibf.clear(bin_range);

    // Step 3: Build the IBF type under test (uncompressed or compressed) and verify via bulk_contains.
    TypeParam ibf2{ibf};
    auto agent = ibf2.membership_agent();

    // Every value is expected in every bin, except in the bins that were just cleared.
    sdsl::bit_vector expected(64, 1);
    for (seqan3::bin_index const cleared_bin : bin_range)
        expected[cleared_bin.get()] = 0;

    for (size_t hash = 0; hash < 64; ++hash)
        EXPECT_EQ(agent.bulk_contains(hash), expected);
}

TYPED_TEST(interleaved_bloom_filter_test, counting)
{
// 1. Test uncompressed interleaved_bloom_filter directly because the compressed one is not mutable.
Expand Down

1 comment on commit 696b6e3

@vercel
Copy link

@vercel vercel bot commented on 696b6e3 Mar 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.