diff --git a/include/hibf/interleaved_bloom_filter.hpp b/include/hibf/interleaved_bloom_filter.hpp index 4194efba..ff901f20 100644 --- a/include/hibf/interleaved_bloom_filter.hpp +++ b/include/hibf/interleaved_bloom_filter.hpp @@ -206,10 +206,15 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector * \param bins_ The number of bins. * \param size The bitvector size. * \param funs The number of hash functions. Default 2. At least 1, at most 5. - * \param track_occupancy_ Whether to track the occupancy of the bins. + * \param empty_bin_fraction The fraction of total technical bins that should be empty. * * \details * + * Upon construction, `bins_` many bins are immediately accessible. + * Additionally, enough space is reserved to accommodate a total of at least `floor(bins_ / (1 - empty_bin_fraction))` many bins. + * For example, with `bins_ = 64` and `empty_bin_fraction = 0.5`, the Interleaved Bloom Filter will reserve space + * for 128 bins, 64 of which are immediately accessible and 64 of which are reserved for future use. + * * ### Example * * \include test/snippet/ibf/interleaved_bloom_filter_constructor.cpp @@ -217,7 +222,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector interleaved_bloom_filter(seqan::hibf::bin_count const bins_, seqan::hibf::bin_size const size, seqan::hibf::hash_function_count const funs = seqan::hibf::hash_function_count{2u}, - bool const track_occupancy_ = false); + double const empty_bin_fraction = 0.0); /*!\brief Construct an Interleaved Bloom Filter. * \param configuration The seqan::hibf::config. 
diff --git a/include/hibf/misc/add_empty_bins.hpp b/include/hibf/misc/add_empty_bins.hpp new file mode 100644 index 00000000..c3f099cd --- /dev/null +++ b/include/hibf/misc/add_empty_bins.hpp @@ -0,0 +1,25 @@ +// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include // for floor +#include // for size_t + +#include + +namespace seqan::hibf +{ + +/*!\brief Returns the total number of bins such that a `fraction` of bins is empty, and `tmax` many bins are non-empty. + * \param[in] tmax The number of non-empty bins. + * \param[in] fraction The fraction of the total number of bins that should be empty. + * \ingroup hibf + */ +[[nodiscard]] constexpr size_t add_empty_bins(size_t const tmax, double const fraction) noexcept +{ + return std::floor(tmax / (1.0 - fraction)); +} + +} // namespace seqan::hibf diff --git a/include/hibf/misc/subtract_empty_bins.hpp b/include/hibf/misc/subtract_empty_bins.hpp index e1033784..ae107994 100644 --- a/include/hibf/misc/subtract_empty_bins.hpp +++ b/include/hibf/misc/subtract_empty_bins.hpp @@ -5,6 +5,7 @@ #pragma once #include // for clamp +#include // for ceil #include // for size_t #include @@ -25,7 +26,7 @@ namespace seqan::hibf if (fraction == 0.0 || tmax <= 2u) return tmax; - size_t const number_of_empty_bins = std::clamp(tmax * fraction, 1, tmax - 2); + size_t const number_of_empty_bins = std::clamp(std::ceil(tmax * fraction), 1, tmax - 2); return tmax - number_of_empty_bins; } diff --git a/src/build/construct_ibf.cpp b/src/build/construct_ibf.cpp index ef529650..3cefaff9 100644 --- a/src/build/construct_ibf.cpp +++ b/src/build/construct_ibf.cpp @@ -51,7 +51,7 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s seqan::hibf::interleaved_bloom_filter ibf{bin_count, bin_size, 
seqan::hibf::hash_function_count{data.config.number_of_hash_functions}, - data.config.empty_bin_fraction > 0.0}; + data.config.empty_bin_fraction}; local_index_allocation_timer.stop(); data.index_allocation_timer += local_index_allocation_timer; diff --git a/src/config.cpp b/src/config.cpp index b9f95d34..65e50b36 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -17,6 +17,7 @@ #include // for config #include // for meta_header, meta_hibf_config_end, meta_hibf_config_start #include // for next_multiple_of_64 +#include // for subtract_empty_bins namespace seqan::hibf { @@ -116,6 +117,12 @@ void config::validate_and_set_defaults() if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0) throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."}; + if (empty_bin_fraction != 0.0) + { + size_t const substracted = seqan::hibf::subtract_empty_bins(tmax, empty_bin_fraction); + empty_bin_fraction = 1.0 - static_cast(substracted) / tmax; + } + if (alpha < 0.0) throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."}; diff --git a/src/hierarchical_interleaved_bloom_filter.cpp b/src/hierarchical_interleaved_bloom_filter.cpp index 43f6f4ea..b3a7e1a6 100644 --- a/src/hierarchical_interleaved_bloom_filter.cpp +++ b/src/hierarchical_interleaved_bloom_filter.cpp @@ -63,13 +63,15 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, if (current_node.max_bin_is_merged()) { // recursively initialize favourite child first - technical_bin_to_ibf_id[current_node.max_bin_index] = + size_t const favourite_child_ibf_idx = hierarchical_build(hibf, kmers, current_node.children[current_node.favourite_child_idx.value()], data, false, ibf_pos); + technical_bin_to_ibf_id[current_node.max_bin_index] = favourite_child_ibf_idx; + hibf.prev_ibf_id[favourite_child_ibf_idx] = {.ibf_idx = ibf_pos, .bin_idx = current_node.max_bin_index}; return 1; } else // max bin is not a merged bin @@ -127,7 +129,7 @@ size_t 
hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, robin_hood::unordered_flat_set local_kmers{}; size_t const local_ibf_pos = hierarchical_build(hibf, local_kmers, child, data, false, ibf_pos); auto parent_bin_index = child.parent_bin_index; - hibf.prev_ibf_id[local_ibf_pos] = {.ibf_idx = parent_ibf_idx, .bin_idx = parent_bin_index}; + hibf.prev_ibf_id[local_ibf_pos] = {.ibf_idx = ibf_pos, .bin_idx = parent_bin_index}; { size_t const mutex_id{parent_bin_index / 64}; std::lock_guard guard{local_ibf_mutex[mutex_id]}; diff --git a/src/interleaved_bloom_filter.cpp b/src/interleaved_bloom_filter.cpp index a0127c28..a2644730 100644 --- a/src/interleaved_bloom_filter.cpp +++ b/src/interleaved_bloom_filter.cpp @@ -15,6 +15,7 @@ #include // for config, insert_iterator #include // for hash, unordered_flat_set #include // for interleaved_bloom_filter, bin_count, bin_index, bin_size, hash_... +#include // for add_empty_bins #include // for bit_vector #include // for divide_and_ceil #include // for assert, unreachable @@ -32,11 +33,11 @@ namespace seqan::hibf interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count const bins_, seqan::hibf::bin_size const size, seqan::hibf::hash_function_count const funs, - bool const track_occupancy_) : + double const empty_bin_fraction) : bins{bins_.value}, bin_size_{size.value}, hash_funs{funs.value}, - track_occupancy{track_occupancy_} + track_occupancy{empty_bin_fraction > 0.0} { if (bins == 0) throw std::logic_error{"The number of bins must be > 0."}; @@ -47,7 +48,14 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count const hash_shift = std::countl_zero(bin_size_); bin_words = divide_and_ceil(bins, 64u); - technical_bins = bin_words * 64u; + // Note: If the IBF is constructed as part of an HIBF, the layout only contains information about the + // non-empty bins. Hence, we need to add the empty bins to the total number of bins. 
+ // Example: tmax = 320, empty_bin_fraction = 0.2 + // user bins = 320 * (1 - 0.2) = 256 + // The layout only contains information about the 256 bins. + // The HIBF construction will then request an IBF with 256 bins. + size_t const user_and_empty_bins = add_empty_bins(bins, empty_bin_fraction); + technical_bins = next_multiple_of_64(user_and_empty_bins); resize(technical_bins * bin_size_); occupancy.resize(technical_bins, 0u); } @@ -105,7 +113,7 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_ interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins}, seqan::hibf::bin_size{max_bin_size(configuration, max_bin_elements)}, seqan::hibf::hash_function_count{configuration.number_of_hash_functions}, - configuration.empty_bin_fraction > 0.0} + configuration.empty_bin_fraction} { size_t const chunk_size = std::clamp(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u); @@ -149,12 +157,13 @@ void interleaved_bloom_filter::clear(bin_index const bin) noexcept bool interleaved_bloom_filter::try_increase_bin_number_to(seqan::hibf::bin_count const new_bin_count) noexcept { size_t const new_bins = new_bin_count.value; - size_t const new_bin_words = divide_and_ceil(new_bins, 64u); + size_t const new_technical_bins = next_multiple_of_64(new_bins); - if (new_bins < bins || new_bin_words > bin_words) + if (new_bins < bins || new_technical_bins > technical_bins) return false; bins = new_bins; + bin_words = divide_and_ceil(new_bins, 64u); return true; } diff --git a/src/layout/hierarchical_binning.cpp b/src/layout/hierarchical_binning.cpp index 1b6474de..73f495ab 100644 --- a/src/layout/hierarchical_binning.cpp +++ b/src/layout/hierarchical_binning.cpp @@ -17,6 +17,7 @@ #include // for hierarchical_binning #include // for layout #include // for simple_binning +#include // for add_empty_bins #include // for divide_and_ceil #include // for next_multiple_of_64 #include // for subtract_empty_bins @@ -411,7 +412,7 @@ 
size_t hierarchical_binning::add_lower_level(data_store & libf_data) const size_t const number_of_user_bins = libf_data.positions.size(); // now do the binning for the low-level IBF: - if (number_of_user_bins > config.tmax) + if (number_of_user_bins > subtract_empty_bins(config.tmax, config.empty_bin_fraction)) { // recursively call hierarchical binning if there are still too many UBs return hierarchical_binning{libf_data, config}.execute(); // return id of maximum technical bin @@ -420,8 +421,9 @@ size_t hierarchical_binning::add_lower_level(data_store & libf_data) const { // use simple binning to distribute remaining UBs // Simple binning is not bound by config.tmax - size_t const num_user_bins_next_64 = next_multiple_of_64(number_of_user_bins); - size_t const number_of_technical_bins = subtract_empty_bins(num_user_bins_next_64, config.empty_bin_fraction); + size_t const user_and_empty_bins = add_empty_bins(number_of_user_bins, config.empty_bin_fraction); + size_t const number_of_technical_bins = + subtract_empty_bins(next_multiple_of_64(user_and_empty_bins), config.empty_bin_fraction); return simple_binning{libf_data, number_of_technical_bins}.execute(); // return id of maximum technical bin } } diff --git a/test/unit/hibf/config_test.cpp b/test/unit/hibf/config_test.cpp index 02adfb6e..ebe5d89f 100644 --- a/test/unit/hibf/config_test.cpp +++ b/test/unit/hibf/config_test.cpp @@ -11,9 +11,11 @@ #include // for allocator, char_traits, string #include // for move -#include // for config, insert_iterator -#include // for test_serialisation -#include // for EXPECT_THROW_MSG +#include // for config, insert_iterator +#include // for add_empty_bins +#include // for subtract_empty_bins +#include // for test_serialisation +#include // for EXPECT_THROW_MSG TEST(config_test, write_to) { @@ -405,6 +407,45 @@ TEST(config_test, validate_and_set_defaults) } } +TEST(config_test, empty_bin_fraction) +{ + seqan::hibf::config config{.input_fn = [](size_t const, 
seqan::hibf::insert_iterator) {}, + .number_of_user_bins = 1u}; + + for (size_t top_level_tmax = 64; top_level_tmax <= 1024; top_level_tmax += 64) + { + config.tmax = top_level_tmax; + + // Some deviation for high fractions + for (double fraction = 0.00; fraction <= 0.97; fraction += 0.01) + { + size_t const top_level_subtracted = seqan::hibf::subtract_empty_bins(top_level_tmax, fraction); + double const adjusted_fraction = 1.0 - static_cast(top_level_subtracted) / top_level_tmax; + + config.empty_bin_fraction = fraction; + config.validate_and_set_defaults(); + EXPECT_EQ(config.empty_bin_fraction, adjusted_fraction); + + // Roundtrip test + for (size_t lower_level_tmax = 64; lower_level_tmax <= top_level_tmax; lower_level_tmax += 64) + { + size_t const lower_level_subtracted = + seqan::hibf::subtract_empty_bins(lower_level_tmax, adjusted_fraction); + size_t const lower_level_added = seqan::hibf::add_empty_bins(lower_level_subtracted, adjusted_fraction); + + ASSERT_EQ(seqan::hibf::next_multiple_of_64(lower_level_added), lower_level_tmax) + << "top_level_tmax: " << top_level_tmax << '\n' + << "lower_level_tmax: " << lower_level_tmax << '\n' + << "fraction: " << fraction << '\n' + << "top_level_subtracted: " << top_level_subtracted << '\n' + << "adjusted_fraction: " << adjusted_fraction << '\n' + << "lower_level_subtracted: " << lower_level_subtracted << '\n' + << "lower_level_added: " << lower_level_added << '\n'; + } + } + } +} + TEST(config_test, serialisation) { seqan::hibf::config config{.number_of_user_bins = 123456789, diff --git a/test/unit/hibf/interleaved_bloom_filter_test.cpp b/test/unit/hibf/interleaved_bloom_filter_test.cpp index 21e3000c..b2367e98 100644 --- a/test/unit/hibf/interleaved_bloom_filter_test.cpp +++ b/test/unit/hibf/interleaved_bloom_filter_test.cpp @@ -209,19 +209,18 @@ TEST(ibf_test, emplace) TEST(ibf_test, emplace_with_occupancy) { // 1. 
Construct and emplace - seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{128u}, + seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{64u}, seqan::hibf::bin_size{512}, seqan::hibf::hash_function_count{2u}, - true}; + 0.5}; // 64 empty bins for (size_t bin_idx : std::views::iota(0, 64)) for (size_t hash : std::views::iota(0, 64)) ibf.emplace(hash, seqan::hibf::bin_index{bin_idx}); // 2. Test for correctness - auto agent = ibf.containment_agent(); - std::vector expected(128); + std::vector expected(64); std::fill(expected.begin(), expected.begin() + 64u, true); for (size_t hash : std::views::iota(0, 64)) { @@ -452,9 +451,34 @@ TEST(ibf_test, try_increase_bin_number_to) EXPECT_EQ(ibf.bit_size(), original_bitsize); } +TEST(ibf_test, try_increase_bin_number_to_empty_bins) +{ + seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{60u}, + seqan::hibf::bin_size{1024u}, + seqan::hibf::hash_function_count{2u}, + 0.5}; // 60 empty bins + size_t const original_bitsize{ibf.bit_size()}; + + EXPECT_TRUE(ibf.try_increase_bin_number_to({64u})); + EXPECT_EQ(ibf.bin_count(), 64u); + EXPECT_EQ(ibf.bit_size(), original_bitsize); + + EXPECT_TRUE(ibf.try_increase_bin_number_to({120u})); + EXPECT_EQ(ibf.bin_count(), 120u); + EXPECT_EQ(ibf.bit_size(), original_bitsize); + + EXPECT_FALSE(ibf.try_increase_bin_number_to({129u})); + EXPECT_EQ(ibf.bin_count(), 120u); + EXPECT_EQ(ibf.bit_size(), original_bitsize); + + EXPECT_TRUE(ibf.try_increase_bin_number_to({128u})); + EXPECT_EQ(ibf.bin_count(), 128u); + EXPECT_EQ(ibf.bit_size(), original_bitsize); +} + TEST(ibf_test, increase_bin_number_to) { - seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{73u}, seqan::hibf::bin_size{1024u}}; + seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{73u}, seqan::hibf::bin_size{8u}}; size_t const original_bitsize{ibf.bit_size()}; // 1. Throw if trying to reduce number of bins. 
@@ -495,6 +519,63 @@ TEST(ibf_test, increase_bin_number_to) } } +TEST(ibf_test, increase_bin_number_to_empty_bins) +{ + seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{60u}, + seqan::hibf::bin_size{8u}, + seqan::hibf::hash_function_count{2u}, + 0.5}; // 60 empty bins + size_t const original_bitsize{ibf.bit_size()}; + + // 1. Throw if trying to reduce number of bins. + EXPECT_THROW(ibf.increase_bin_number_to(seqan::hibf::bin_count{50u}), std::invalid_argument); + EXPECT_EQ(ibf.bin_count(), 60u); + EXPECT_EQ(ibf.bit_size(), original_bitsize); + + // 2. No change in bin_words implies no change in size. + ibf.increase_bin_number_to({seqan::hibf::bin_count{60u}}); + EXPECT_EQ(ibf.bit_size(), original_bitsize); + EXPECT_EQ(ibf.bin_count(), 60u); + + // 3. If resizing takes place, the inserted values must still be valid. + auto hashes = std::views::iota(0, 64); + for (size_t current_bin : std::views::iota(0, 64)) // test correct resize for each bin individually + { + seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{64u}, + seqan::hibf::bin_size{8u}, + seqan::hibf::hash_function_count{2u}, + 0.5}; + + EXPECT_EQ(ibf.bit_size(), 1024u); + std::ranges::for_each(hashes, + [&ibf, ¤t_bin](auto const h) + { + ibf.emplace(h, seqan::hibf::bin_index{current_bin}); + }); + + ibf.increase_bin_number_to(seqan::hibf::bin_count{73u}); + EXPECT_EQ(ibf.bin_count(), 73u); + EXPECT_EQ(ibf.bit_size(), 1024); + + ibf.increase_bin_number_to(seqan::hibf::bin_count{135}); + EXPECT_EQ(ibf.bin_count(), 135u); + EXPECT_EQ(ibf.bit_size(), 1536); + + ibf.increase_bin_number_to(seqan::hibf::bin_count{205}); + EXPECT_EQ(ibf.bin_count(), 205); + EXPECT_EQ(ibf.bit_size(), 2048); + + std::vector expected(205, 0); + expected[current_bin] = 1; // none of the bins except current_bin stores the hash values. 
+ auto agent = ibf.containment_agent(); + for (size_t const h : hashes) + { + auto & res = agent.bulk_contains(h); + EXPECT_RANGE_EQ(res, expected); + } + } +} + TEST(ibf_test, copy_agents) { // 1. Construct and emplace