Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions include/hibf/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,18 +206,23 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
* \param bins_ The number of bins.
* \param size The bitvector size.
* \param funs The number of hash functions. Default 2. At least 1, at most 5.
* \param track_occupancy_ Whether to track the occupancy of the bins.
* \param empty_bin_fraction The fraction of total technical bins that should be empty.
*
* \details
*
* Upon construction, `bins_` many bins are immediately accessible.
* Additionally, enough space is reserved to accommodate a total of at least
* `bins_ / (1.0 - empty_bin_fraction)` many bins.
* For example, with `bins_ = 64` and `empty_bin_fraction = 0.5`, the Interleaved Bloom Filter will reserve space
* for 128 bins, 64 of which are immediately accessible and 64 of which are reserved for future use.
*
* ### Example
*
* \include test/snippet/ibf/interleaved_bloom_filter_constructor.cpp
*/
interleaved_bloom_filter(seqan::hibf::bin_count const bins_,
seqan::hibf::bin_size const size,
seqan::hibf::hash_function_count const funs = seqan::hibf::hash_function_count{2u},
bool const track_occupancy_ = false);
double const empty_bin_fraction = 0.0);

/*!\brief Construct an Interleaved Bloom Filter.
* \param configuration The seqan::hibf::config.
Expand Down
25 changes: 25 additions & 0 deletions include/hibf/misc/add_empty_bins.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <cmath> // for floor
#include <cstddef> // for size_t

#include <hibf/platform.hpp>

namespace seqan::hibf
{

/*!\brief Returns the total number of bins such that a `fraction` of bins is empty, and `tmax` many bins are non-empty.
 * \param[in] tmax The number of non-empty bins.
 * \param[in] fraction The fraction of the total number of bins that should be empty. Must be less than 1.0;
 *                     a value of exactly 1.0 would divide by zero.
 * \returns `tmax / (1.0 - fraction)`, rounded down to a whole number of bins.
 * \ingroup hibf
 *
 * \details
 *
 * For example, `tmax = 64` and `fraction = 0.5` yield 128: 64 non-empty bins plus 64 empty bins.
 * With `fraction = 0.0`, `tmax` is returned unchanged.
 */
[[nodiscard]] constexpr size_t add_empty_bins(size_t const tmax, double const fraction) noexcept
{
    // The quotient is non-negative for any fraction below 1.0, so truncating towards zero gives the same result
    // as std::floor. Unlike std::floor (constexpr only since C++23), the cast is usable in constant expressions,
    // and it makes the double -> size_t conversion explicit.
    return static_cast<size_t>(static_cast<double>(tmax) / (1.0 - fraction));
}

} // namespace seqan::hibf
3 changes: 2 additions & 1 deletion include/hibf/misc/subtract_empty_bins.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#pragma once

#include <algorithm> // for clamp
#include <cmath> // for ceil
#include <cstddef> // for size_t

#include <hibf/platform.hpp>
Expand All @@ -25,7 +26,7 @@ namespace seqan::hibf
if (fraction == 0.0 || tmax <= 2u)
return tmax;

size_t const number_of_empty_bins = std::clamp<size_t>(tmax * fraction, 1, tmax - 2);
size_t const number_of_empty_bins = std::clamp<size_t>(std::ceil(tmax * fraction), 1, tmax - 2);
return tmax - number_of_empty_bins;
}

Expand Down
2 changes: 1 addition & 1 deletion src/build/construct_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s
seqan::hibf::interleaved_bloom_filter ibf{bin_count,
bin_size,
seqan::hibf::hash_function_count{data.config.number_of_hash_functions},
data.config.empty_bin_fraction > 0.0};
data.config.empty_bin_fraction};

local_index_allocation_timer.stop();
data.index_allocation_timer += local_index_allocation_timer;
Expand Down
7 changes: 7 additions & 0 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <hibf/config.hpp> // for config
#include <hibf/layout/prefixes.hpp> // for meta_header, meta_hibf_config_end, meta_hibf_config_start
#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64
#include <hibf/misc/subtract_empty_bins.hpp> // for subtract_empty_bins

namespace seqan::hibf
{
Expand Down Expand Up @@ -116,6 +117,12 @@ void config::validate_and_set_defaults()
if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."};

if (empty_bin_fraction != 0.0)
{
size_t const substracted = seqan::hibf::subtract_empty_bins(tmax, empty_bin_fraction);
empty_bin_fraction = 1.0 - static_cast<double>(substracted) / tmax;
}

if (alpha < 0.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."};

Expand Down
6 changes: 4 additions & 2 deletions src/hierarchical_interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,15 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf,
if (current_node.max_bin_is_merged())
{
// recursively initialize favourite child first
technical_bin_to_ibf_id[current_node.max_bin_index] =
size_t const favourite_child_ibf_idx =
hierarchical_build(hibf,
kmers,
current_node.children[current_node.favourite_child_idx.value()],
data,
false,
ibf_pos);
technical_bin_to_ibf_id[current_node.max_bin_index] = favourite_child_ibf_idx;
hibf.prev_ibf_id[favourite_child_ibf_idx] = {.ibf_idx = ibf_pos, .bin_idx = current_node.max_bin_index};
return 1;
}
else // max bin is not a merged bin
Expand Down Expand Up @@ -127,7 +129,7 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf,
robin_hood::unordered_flat_set<uint64_t> local_kmers{};
size_t const local_ibf_pos = hierarchical_build(hibf, local_kmers, child, data, false, ibf_pos);
auto parent_bin_index = child.parent_bin_index;
hibf.prev_ibf_id[local_ibf_pos] = {.ibf_idx = parent_ibf_idx, .bin_idx = parent_bin_index};
hibf.prev_ibf_id[local_ibf_pos] = {.ibf_idx = ibf_pos, .bin_idx = parent_bin_index};
{
size_t const mutex_id{parent_bin_index / 64};
std::lock_guard<std::mutex> guard{local_ibf_mutex[mutex_id]};
Expand Down
21 changes: 15 additions & 6 deletions src/interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/contrib/robin_hood.hpp> // for hash, unordered_flat_set
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_count, bin_index, bin_size, hash_...
#include <hibf/misc/add_empty_bins.hpp> // for add_empty_bins
#include <hibf/misc/bit_vector.hpp> // for bit_vector
#include <hibf/misc/divide_and_ceil.hpp> // for divide_and_ceil
#include <hibf/misc/unreachable.hpp> // for assert, unreachable
Expand All @@ -32,11 +33,11 @@ namespace seqan::hibf
interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count const bins_,
seqan::hibf::bin_size const size,
seqan::hibf::hash_function_count const funs,
bool const track_occupancy_) :
double const empty_bin_fraction) :
bins{bins_.value},
bin_size_{size.value},
hash_funs{funs.value},
track_occupancy{track_occupancy_}
track_occupancy{empty_bin_fraction > 0.0}
{
if (bins == 0)
throw std::logic_error{"The number of bins must be > 0."};
Expand All @@ -47,7 +48,14 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count const

hash_shift = std::countl_zero(bin_size_);
bin_words = divide_and_ceil(bins, 64u);
technical_bins = bin_words * 64u;
// Note: If the IBF is constructed as part of an HIBF, the layout only contains information about the
// non-empty bins. Hence, we need to add the empty bins to the total number of bins.
// Example: tmax = 320, empty_bin_fraction = 0.2
// user bins = 320 * (1 - 0.2) = 256
// The layout only contains information about the 256 bins.
// The HIBF construction will then request an IBF with 256 bins.
size_t const user_and_empty_bins = add_empty_bins(bins, empty_bin_fraction);
technical_bins = next_multiple_of_64(user_and_empty_bins);
resize(technical_bins * bin_size_);
occupancy.resize(technical_bins, 0u);
}
Expand Down Expand Up @@ -105,7 +113,7 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_
interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins},
seqan::hibf::bin_size{max_bin_size(configuration, max_bin_elements)},
seqan::hibf::hash_function_count{configuration.number_of_hash_functions},
configuration.empty_bin_fraction > 0.0}
configuration.empty_bin_fraction}
{
size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u);

Expand Down Expand Up @@ -149,12 +157,13 @@ void interleaved_bloom_filter::clear(bin_index const bin) noexcept
bool interleaved_bloom_filter::try_increase_bin_number_to(seqan::hibf::bin_count const new_bin_count) noexcept
{
size_t const new_bins = new_bin_count.value;
size_t const new_bin_words = divide_and_ceil(new_bins, 64u);
size_t const new_technical_bins = next_multiple_of_64(new_bins);

if (new_bins < bins || new_bin_words > bin_words)
if (new_bins < bins || new_technical_bins > technical_bins)
return false;

bins = new_bins;
bin_words = divide_and_ceil(new_bins, 64u);
return true;
}

Expand Down
8 changes: 5 additions & 3 deletions src/layout/hierarchical_binning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <hibf/layout/hierarchical_binning.hpp> // for hierarchical_binning
#include <hibf/layout/layout.hpp> // for layout
#include <hibf/layout/simple_binning.hpp> // for simple_binning
#include <hibf/misc/add_empty_bins.hpp> // for add_empty_bins
#include <hibf/misc/divide_and_ceil.hpp> // for divide_and_ceil
#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64
#include <hibf/misc/subtract_empty_bins.hpp> // for subtract_empty_bins
Expand Down Expand Up @@ -411,7 +412,7 @@ size_t hierarchical_binning::add_lower_level(data_store & libf_data) const
size_t const number_of_user_bins = libf_data.positions.size();

// now do the binning for the low-level IBF:
if (number_of_user_bins > config.tmax)
if (number_of_user_bins > subtract_empty_bins(config.tmax, config.empty_bin_fraction))
{
// recursively call hierarchical binning if there are still too many UBs
return hierarchical_binning{libf_data, config}.execute(); // return id of maximum technical bin
Expand All @@ -420,8 +421,9 @@ size_t hierarchical_binning::add_lower_level(data_store & libf_data) const
{
// use simple binning to distribute remaining UBs
// Simple binning is not bound by config.tmax
size_t const num_user_bins_next_64 = next_multiple_of_64(number_of_user_bins);
size_t const number_of_technical_bins = subtract_empty_bins(num_user_bins_next_64, config.empty_bin_fraction);
size_t const user_and_empty_bins = add_empty_bins(number_of_user_bins, config.empty_bin_fraction);
size_t const number_of_technical_bins =
subtract_empty_bins(next_multiple_of_64(user_and_empty_bins), config.empty_bin_fraction);
return simple_binning{libf_data, number_of_technical_bins}.execute(); // return id of maximum technical bin
}
}
Expand Down
47 changes: 44 additions & 3 deletions test/unit/hibf/config_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@
#include <string> // for allocator, char_traits, string
#include <utility> // for move

#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/test/cereal.hpp> // for test_serialisation
#include <hibf/test/expect_throw_msg.hpp> // for EXPECT_THROW_MSG
#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/misc/add_empty_bins.hpp> // for add_empty_bins
#include <hibf/misc/subtract_empty_bins.hpp> // for subtract_empty_bins
#include <hibf/test/cereal.hpp> // for test_serialisation
#include <hibf/test/expect_throw_msg.hpp> // for EXPECT_THROW_MSG

TEST(config_test, write_to)
{
Expand Down Expand Up @@ -405,6 +407,45 @@ TEST(config_test, validate_and_set_defaults)
}
}

// Checks how config::validate_and_set_defaults() adjusts `empty_bin_fraction`, and that the adjusted fraction
// round-trips through subtract_empty_bins()/add_empty_bins() for every multiple-of-64 tmax up to the top-level tmax.
TEST(config_test, empty_bin_fraction)
{
seqan::hibf::config config{.input_fn = [](size_t const, seqan::hibf::insert_iterator) {},
.number_of_user_bins = 1u};

// Top-level tmax takes every multiple of 64 from 64 to 1024 (inclusive).
for (size_t top_level_tmax = 64; top_level_tmax <= 1024; top_level_tmax += 64)
{
config.tmax = top_level_tmax;

// Only fractions up to 0.97 are exercised; the original note here reads "Some deviation for high fractions".
// NOTE(review): `fraction` is produced by repeatedly adding 0.01, so it carries accumulated floating-point
// rounding and is only approximately a multiple of 0.01.
for (double fraction = 0.00; fraction <= 0.97; fraction += 0.01)
{
// Expected value: the share of bins that subtract_empty_bins() actually removes from the top-level tmax.
size_t const top_level_subtracted = seqan::hibf::subtract_empty_bins(top_level_tmax, fraction);
double const adjusted_fraction = 1.0 - static_cast<double>(top_level_subtracted) / top_level_tmax;

// The requested fraction is written into the config; after validation it must compare exactly equal
// (EXPECT_EQ on doubles) to the adjusted fraction computed above.
config.empty_bin_fraction = fraction;
config.validate_and_set_defaults();
EXPECT_EQ(config.empty_bin_fraction, adjusted_fraction);

// Roundtrip test: remove the empty bins from a lower-level tmax, add them back, and round up to the next
// multiple of 64. The result must be the lower-level tmax again.
for (size_t lower_level_tmax = 64; lower_level_tmax <= top_level_tmax; lower_level_tmax += 64)
{
size_t const lower_level_subtracted =
seqan::hibf::subtract_empty_bins(lower_level_tmax, adjusted_fraction);
size_t const lower_level_added = seqan::hibf::add_empty_bins(lower_level_subtracted, adjusted_fraction);

// ASSERT_EQ aborts this test at the first mismatch; the streamed values identify the failing combination.
ASSERT_EQ(seqan::hibf::next_multiple_of_64(lower_level_added), lower_level_tmax)
<< "top_level_tmax: " << top_level_tmax << '\n'
<< "lower_level_tmax: " << lower_level_tmax << '\n'
<< "fraction: " << fraction << '\n'
<< "top_level_subtracted: " << top_level_subtracted << '\n'
<< "adjusted_fraction: " << adjusted_fraction << '\n'
<< "lower_level_subtracted: " << lower_level_subtracted << '\n'
<< "lower_level_added: " << lower_level_added << '\n';
}
}
}
}

TEST(config_test, serialisation)
{
seqan::hibf::config config{.number_of_user_bins = 123456789,
Expand Down
Loading