From 3a66de938a273383058e677a1d55fc516c82bdd4 Mon Sep 17 00:00:00 2001 From: Paulius Klyvis Date: Wed, 7 Dec 2016 11:25:50 +0100 Subject: [PATCH 1/2] set correct count after unioning two bloom filters --- pybloom/pybloom.py | 5 +++++ pybloom/tests.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/pybloom/pybloom.py b/pybloom/pybloom.py index beeefe4..568ad5a 100644 --- a/pybloom/pybloom.py +++ b/pybloom/pybloom.py @@ -222,6 +222,11 @@ def union(self, other): both the same capacity and error rate") new_bloom = self.copy() new_bloom.bitarray = new_bloom.bitarray | other.bitarray + # Set the new count + # https://en.wikipedia.org/wiki/Bloom_filter#The_union_and_intersection_of_sets + #new_bloom.count = int(round(-(float(self.num_bits) / float(self.num_slices)) * math.log(1 - (float(new_bloom.bitarray.count(1)) / float(self.num_bits))), 0)) + new_bloom.count = int(round(-float(self.bits_per_slice) * math.log( + 1 - (float(new_bloom.bitarray.count(1)) / float(self.num_bits))), 0)) return new_bloom def __or__(self, other): diff --git a/pybloom/tests.py b/pybloom/tests.py index 13d9b7d..17c8aa0 100644 --- a/pybloom/tests.py +++ b/pybloom/tests.py @@ -35,6 +35,36 @@ def test_union(self): for char in chars: self.assertTrue(char in new_bloom) + def test_union_size(self): + fpr = 0.001 + # False positive rate with small numbers is high, therefore let's test with bigger sets + bloom_one = BloomFilter(100000, fpr) + bloom_two = BloomFilter(100000, fpr) + listA = [str(random.getrandbits(8)) for i in range(10000)] + listB = [str(random.getrandbits(8)) for i in range(10000)] + + for char in listA: + bloom_one.add(char) + for char in listB: + bloom_two.add(char) + + merged_bloom = bloom_one.union(bloom_two) + + bloom_one_count = bloom_one.count + bloom_two_count = bloom_two.count + + listA_uniq_count = len(set(listA)) + listB_uniq_count = len(set(listB)) + + merged_bloom_count = merged_bloom.count + listAB_uniq_count = len(set(listA).union(set(listB))) + + assert bloom_one_count == listA_uniq_count + assert bloom_two_count == listB_uniq_count + assert (listAB_uniq_count * (1 - fpr) <= merged_bloom_count <= listAB_uniq_count * (1 + fpr)) + + + def test_intersection(self): bloom_one = BloomFilter(100, 0.001) bloom_two = BloomFilter(100, 0.001) From a56fcf1e6d6f6d1c953381094237ba87fe61d904 Mon Sep 17 00:00:00 2001 From: Paulius Klyvis Date: Mon, 12 Dec 2016 16:52:05 +0100 Subject: [PATCH 2/2] bloom filter intersection correct count --- pybloom/pybloom.py | 11 ++++++++--- pybloom/tests.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/pybloom/pybloom.py b/pybloom/pybloom.py index 568ad5a..eb02291 100644 --- a/pybloom/pybloom.py +++ b/pybloom/pybloom.py @@ -224,9 +224,8 @@ def union(self, other): new_bloom.bitarray = new_bloom.bitarray | other.bitarray # Set the new count # https://en.wikipedia.org/wiki/Bloom_filter#The_union_and_intersection_of_sets - #new_bloom.count = int(round(-(float(self.num_bits) / float(self.num_slices)) * math.log(1 - (float(new_bloom.bitarray.count(1)) / float(self.num_bits))), 0)) - new_bloom.count = int(round(-float(self.bits_per_slice) * math.log( - 1 - (float(new_bloom.bitarray.count(1)) / float(self.num_bits))), 0)) + new_bloom.count = -float(self.bits_per_slice) * math.log( + 1 - (float(new_bloom.bitarray.count(1)) / float(self.num_bits))) return new_bloom def __or__(self, other): @@ -241,6 +240,12 @@ def intersection(self, other): have equal capacity and error rate") new_bloom = self.copy() new_bloom.bitarray = new_bloom.bitarray & other.bitarray + # Set the new count + # https://en.wikipedia.org/wiki/Bloom_filter#The_union_and_intersection_of_sets + # The FPR in the resulting Bloom filter may be larger than the false positive probability in the Bloom filter created from scratch using the intersection of the two set + # Intersection guarantees to have all elements of the intersection but the false positive rate might be slightly higher than that of the pure intersection: + new_bloom.count = self.count + other.count + float(self.bits_per_slice) * math.log( + 1 - (float((self.copy() | other).bitarray.count(1)) / float(self.num_bits))) return new_bloom def __and__(self, other): diff --git a/pybloom/tests.py b/pybloom/tests.py index 17c8aa0..8ec1712 100644 --- a/pybloom/tests.py +++ b/pybloom/tests.py @@ -63,6 +63,35 @@ def test_union_size(self): assert bloom_two_count == listB_uniq_count assert (listAB_uniq_count * (1 - fpr) <= merged_bloom_count <= listAB_uniq_count * (1 + fpr)) + def test_intersection_size(self): + fpr = 0.001 + # False positive rate with small numbers is high, therefore let's test with bigger sets + bloom_one = BloomFilter(100000, fpr) + bloom_two = BloomFilter(100000, fpr) + listA = [str(random.getrandbits(14)) for i in range(71000)] + listB = [str(random.getrandbits(12)) for i in range(69000)] + + for char in listA: + bloom_one.add(char) + for char in listB: + bloom_two.add(char) + + merged_bloom = bloom_one.intersection(bloom_two) + + bloom_one_count = bloom_one.count + bloom_two_count = bloom_two.count + + listA_uniq_count = len(set(listA)) + listB_uniq_count = len(set(listB)) + + merged_bloom_count = merged_bloom.count + listAB_uniq_count = len(set(listA).intersection(set(listB))) + + assert bloom_one_count == listA_uniq_count + assert bloom_two_count == listB_uniq_count + # Intersection guarantees to have all elements of the intersection but the false positive rate might be slightly higher than that of the pure intersection: + assert (listAB_uniq_count * (1 - 2 * fpr) <= merged_bloom_count <= listAB_uniq_count * (1 + 2 * fpr)) + def test_intersection(self):