@@ -148,7 +148,7 @@ pub fn init(
148148 if (trigrams .len > 0 ) {
149149 var prng = std .Random .DefaultPrng .init (0 );
150150
151- const filter_capacity = CuckooFilter .capacityForCount (store .trigram_to_declarations .count ()) catch unreachable ;
151+ const filter_capacity = CuckooFilter .capacityForCount (@intCast ( store .trigram_to_declarations .count () )) catch unreachable ;
152152 try store .filter_buckets .ensureTotalCapacityPrecise (allocator , filter_capacity );
153153 store .filter_buckets .items .len = filter_capacity ;
154154
@@ -279,44 +279,53 @@ fn mergeIntersection(
279279 return out_idx ;
280280}
281281
282- // TODO: The pow2 requirement is quite inefficient: explore ideas posted in
283- // https://databasearchitects.blogspot.com/2019/07/cuckoo-filters-with-arbitrarily-sized.html
284- // (rocksdb even-odd scheme from comments looks interesting).
/// Classifies `integer` by its lowest bit: `.even` when the bit is clear, `.odd` when it is set.
/// Accepts any integer type.
fn parity(integer: anytype) enum(u1) { even, odd } {
    const lowest_bit = integer & 1;
    return if (lowest_bit == 0) .even else .odd;
}
285+
285286pub const CuckooFilter = struct {
286- /// len must be a power of 2.
287- ///
288- /// ### Pathological case with buckets.len power of 2
289- ///
290- /// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_2`
291- /// - `BucketIndex(alias_1)` -> `bucket_1`, `BucketIndex(alias_1).alternate()` -> `bucket_2`
292- ///
293- /// Our alternate mappings hold and `contains()` will not return false negatives.
294- ///
295- /// ### Pathological case with buckets.len NOT power of 2:
296- ///
297- /// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_3`
298- /// - `BucketIndex(alias_1)` -> `bucket_2`, `BucketIndex(alias_1).alternate()` -> `bucket_4`
299- ///
300- /// Our alternate mappings do not hold and `contains()` can return false negatives. This is not
301- /// acceptable as the entire point of an AMQ datastructure is the presence of false positives
302- /// but not false negatives.
303287 buckets : []Bucket ,
304288
/// One-byte tag stored in a bucket slot. `.none` (the maximum `u8` value) is the sentinel
/// used for an unoccupied slot; every other value is a real fingerprint.
pub const Fingerprint = enum(u8) {
    none = std.math.maxInt(u8),
    _,

    /// Comptime-built lookup table covering fingerprint values 0..254 (there is no entry for
    /// `.none`). Entry `i` is the Murmur2_64 hash of the single byte `i` with its lowest bit
    /// forced on, truncated to 32 bits; the forced bit survives truncation, so every entry is odd.
    const precomputed_odd_hashes = blk: {
        var table: [255]u32 = undefined;

        for (0..table.len) |index| {
            const key = [1]u8{@intCast(index)};
            const full_hash = std.hash.Murmur2_64.hash(&key);
            table[index] = @truncate(full_hash | 1);
        }

        break :blk table;
    };

    /// Returns the precomputed odd hash for `fingerprint`.
    /// Asserts that `fingerprint` is not `.none`, which has no table entry.
    pub fn oddHash(fingerprint: Fingerprint) u32 {
        assert(fingerprint != .none);
        const table_index: u8 = @intFromEnum(fingerprint);
        return precomputed_odd_hashes[table_index];
    }
};
308+
313309 pub const Bucket = [4 ]Fingerprint ;
/// Position of a bucket inside the filter's bucket slice.
pub const BucketIndex = enum(u32) {
    _,

    /// Returns the other candidate bucket for `fingerprint`, given a table of `len` buckets.
    ///
    /// An even index moves forward by the fingerprint's odd hash and an odd index moves
    /// backward by it; the result is wrapped into `0..len` with `@mod`. Because the step is
    /// odd, the result is expected to have the opposite parity of `index`, which is asserted
    /// after wrapping.
    ///
    /// Asserts `index < len` and `fingerprint != .none`.
    pub fn alternate(index: BucketIndex, fingerprint: Fingerprint, len: u32) BucketIndex {
        assert(@intFromEnum(index) < len);
        assert(fingerprint != .none);

        // Widen to i64 so that neither the addition nor the subtraction of two u32-sized
        // values can leave the representable range.
        const current: i64 = @intFromEnum(index);
        const step: i64 = fingerprint.oddHash();
        const current_parity = parity(current);

        const shifted = if (current_parity == .even) current + step else current - step;
        const wrapped: u32 = @intCast(@mod(shifted, len));

        assert(parity(wrapped) != current_parity);

        return @enumFromInt(wrapped);
    }
};
322331
@@ -325,41 +334,46 @@ pub const CuckooFilter = struct {
325334 index_1 : BucketIndex ,
326335 index_2 : BucketIndex ,
327336
328- pub fn initFromTrigram (trigram : Trigram ) Triplet {
/// Derives the fingerprint and both candidate bucket indices for `trigram` in a table of
/// `len` buckets.
///
/// The 64-bit Murmur2 hash of the trigram is reinterpreted as a packed struct: the lowest
/// 8 bits become the fingerprint, the next 24 bits are ignored, and the highest 32 bits are
/// reduced modulo `len` to give the first index. A fingerprint that collides with the `.none`
/// sentinel is replaced by the value 1. The second index is the first index's alternate, and
/// the mapping is asserted to round-trip back to the first index.
pub fn initFromTrigram(trigram: Trigram, len: u32) Triplet {
    const Split = packed struct {
        fingerprint: Fingerprint,
        padding: u24,
        index_1: u32,
    };
    const split: Split = @bitCast(std.hash.Murmur2_64.hash(&trigram));

    const fingerprint: Fingerprint = if (split.fingerprint != .none)
        split.fingerprint
    else
        @enumFromInt(1);

    const index_1: BucketIndex = @enumFromInt(split.index_1 % len);
    const index_2 = index_1.alternate(fingerprint, len);

    // Lookups rely on being able to reach either bucket from the other.
    assert(index_2.alternate(fingerprint, len) == index_1);

    return .{
        .fingerprint = fingerprint,
        .index_1 = index_1,
        .index_2 = index_2,
    };
}
349360 };
350361
/// Wraps caller-provided bucket storage in a `CuckooFilter`.
/// Asserts that the number of buckets is even. The storage is used as-is; this function
/// does not clear it.
pub fn init(buckets: []Bucket) CuckooFilter {
    const bucket_count = buckets.len;
    assert(parity(bucket_count) == .even);

    const filter: CuckooFilter = .{ .buckets = buckets };
    return filter;
}
366+
/// Marks every slot of every bucket as empty by filling the table with `.none` fingerprints.
pub fn reset(filter: CuckooFilter) void {
    const slots_per_bucket = @typeInfo(Bucket).array.len;
    const empty_bucket: Bucket = [_]Fingerprint{.none} ** slots_per_bucket;
    @memset(filter.buckets, empty_bucket);
}
354370
355- pub fn capacityForCount (count : usize ) error {Overflow }! usize {
356- const fill_rate = 0.95 ;
357- return try std .math .ceilPowerOfTwo (usize , @intFromFloat (@ceil (@as (f32 , @floatFromInt (count )) / fill_rate )));
/// Returns the number of buckets to allocate for `count` elements: `count` itself when it is
/// even, otherwise `count + 1`, so the result is always even.
///
/// Returns `error.Overflow` when rounding up would not fit in a `u32`
/// (only possible for `count == maxInt(u32)`).
pub fn capacityForCount(count: u32) error{Overflow}!u32 {
    // `count & 1` is 1 exactly when `count` is odd. A checked add is used so the declared
    // error is actually reported instead of tripping integer-overflow safety (or invoking
    // undefined behaviour in unsafe release modes).
    return std.math.add(u32, count, count & 1);
}
359374
360- // Use a hash (fnv) for randomness.
361375 pub fn append (filter : CuckooFilter , random : std.Random , trigram : Trigram ) error {EvictionFailed }! void {
362- const triplet : Triplet = .initFromTrigram (trigram );
376+ const triplet : Triplet = .initFromTrigram (trigram , @intCast ( filter . buckets . len ) );
363377
364378 if (filter .appendToBucket (triplet .index_1 , triplet .fingerprint ) or
365379 filter .appendToBucket (triplet .index_2 , triplet .fingerprint ))
@@ -371,7 +385,7 @@ pub const CuckooFilter = struct {
371385 var index = if (random .boolean ()) triplet .index_1 else triplet .index_2 ;
372386 for (0.. 500) | _ | {
373387 fingerprint = filter .swapFromBucket (random , index , fingerprint );
374- index = index .alternate (fingerprint );
388+ index = index .alternate (fingerprint , @intCast ( filter . buckets . len ) );
375389
376390 if (filter .appendToBucket (index , fingerprint )) {
377391 return ;
@@ -382,8 +396,7 @@ pub const CuckooFilter = struct {
382396 }
383397
/// Returns a pointer to the bucket at `index`.
/// The index is used directly, without masking or wrapping, so it must already be
/// within `filter.buckets`.
fn bucketAt(filter: CuckooFilter, index: BucketIndex) *Bucket {
    const slot: usize = @intFromEnum(index);
    return &filter.buckets[slot];
}
388401
389402 fn appendToBucket (filter : CuckooFilter , index : BucketIndex , fingerprint : Fingerprint ) bool {
@@ -408,6 +421,7 @@ pub const CuckooFilter = struct {
408421 ) Fingerprint {
409422 assert (fingerprint != .none );
410423
424+ comptime assert (@typeInfo (Bucket ).array .len == 4 );
411425 const target = & filter .bucketAt (index )[random .int (u2 )];
412426
413427 const old_fingerprint = target .* ;
@@ -419,7 +433,7 @@ pub const CuckooFilter = struct {
419433 }
420434
421435 pub fn contains (filter : CuckooFilter , trigram : Trigram ) bool {
422- const triplet : Triplet = .initFromTrigram (trigram );
436+ const triplet : Triplet = .initFromTrigram (trigram , @intCast ( filter . buckets . len ) );
423437
424438 return filter .containsInBucket (triplet .index_1 , triplet .fingerprint ) or
425439 filter .containsInBucket (triplet .index_2 , triplet .fingerprint );
@@ -443,16 +457,15 @@ pub const CuckooFilter = struct {
443457test CuckooFilter {
444458 const allocator = std .testing .allocator ;
445459
446- const element_count = 486 ;
460+ const element_count = 499 ;
447461 const filter_size = comptime CuckooFilter .capacityForCount (element_count ) catch unreachable ;
448- try std .testing .expectEqual (512 , filter_size );
449462
450463 var entries : std .AutoArrayHashMapUnmanaged (Trigram , void ) = .empty ;
451464 defer entries .deinit (allocator );
452465 try entries .ensureTotalCapacity (allocator , element_count );
453466
454467 var buckets : [filter_size ]CuckooFilter.Bucket = undefined ;
455- var filter : CuckooFilter = .{ . buckets = & buckets } ;
468+ var filter : CuckooFilter = .init ( & buckets ) ;
456469 var filter_prng : std.Random.DefaultPrng = .init (42 );
457470
458471 for (0.. 2_500) | gen_prng_seed | {
0 commit comments