Skip to content

Commit 39fd986

Browse files
committed
Even/odd cuckoo
1 parent 6b9acfe commit 39fd986

File tree

1 file changed

+57
-44
lines changed

1 file changed

+57
-44
lines changed

src/TrigramStore.zig

Lines changed: 57 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@ pub fn init(
148148
if (trigrams.len > 0) {
149149
var prng = std.Random.DefaultPrng.init(0);
150150

151-
const filter_capacity = CuckooFilter.capacityForCount(store.trigram_to_declarations.count()) catch unreachable;
151+
const filter_capacity = CuckooFilter.capacityForCount(@intCast(store.trigram_to_declarations.count())) catch unreachable;
152152
try store.filter_buckets.ensureTotalCapacityPrecise(allocator, filter_capacity);
153153
store.filter_buckets.items.len = filter_capacity;
154154

@@ -279,44 +279,53 @@ fn mergeIntersection(
279279
return out_idx;
280280
}
281281

282-
// TODO: The pow2 requirement is quite inefficient: explore ideas posted in
283-
// https://databasearchitects.blogspot.com/2019/07/cuckoo-filters-with-arbitrarily-sized.html
284-
// (rocksdb even-odd scheme from comments looks interesting).
282+
fn parity(integer: anytype) enum(u1) { even, odd } {
283+
return @enumFromInt(integer & 1);
284+
}
285+
285286
pub const CuckooFilter = struct {
286-
/// len must be a power of 2.
287-
///
288-
/// ### Pathological case with buckets.len power of 2
289-
///
290-
/// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_2`
291-
/// - `BucketIndex(alias_1)` -> `bucket_1`, `BucketIndex(alias_1).alternate()` -> `bucket_2`
292-
///
293-
/// Our alternate mappings hold and `contains()` will not return false negatives.
294-
///
295-
/// ### Pathological case with buckets.len NOT power of 2:
296-
///
297-
/// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_3`
298-
/// - `BucketIndex(alias_1)` -> `bucket_2`, `BucketIndex(alias_1).alternate()` -> `bucket_4`
299-
///
300-
/// Our alternate mappings do not hold and `contains()` can return false negatives. This is not
301-
/// acceptable as the entire point of an AMQ datastructure is the presence of false positives
302-
/// but not false negatives.
303287
buckets: []Bucket,
304288

305289
pub const Fingerprint = enum(u8) {
306290
none = std.math.maxInt(u8),
307291
_,
308292

309-
pub fn hash(fingerprint: Fingerprint) u32 {
310-
return @truncate(std.hash.Murmur2_64.hash(&.{@intFromEnum(fingerprint)}));
293+
const precomputed_odd_hashes = blk: {
294+
var table: [255]u32 = undefined;
295+
296+
for (&table, 0..) |*h, index| {
297+
h.* = @truncate(std.hash.Murmur2_64.hash(&.{index}) | 1);
298+
}
299+
300+
break :blk table;
301+
};
302+
303+
pub fn oddHash(fingerprint: Fingerprint) u32 {
304+
assert(fingerprint != .none);
305+
return precomputed_odd_hashes[@intFromEnum(fingerprint)];
311306
}
312307
};
308+
313309
pub const Bucket = [4]Fingerprint;
314310
pub const BucketIndex = enum(u32) {
315311
_,
316312

317-
pub fn alternate(index: BucketIndex, fingerprint: Fingerprint) BucketIndex {
313+
pub fn alternate(index: BucketIndex, fingerprint: Fingerprint, len: u32) BucketIndex {
314+
assert(@intFromEnum(index) < len);
318315
assert(fingerprint != .none);
319-
return @enumFromInt(@intFromEnum(index) ^ fingerprint.hash());
316+
317+
const signed_index: i64 = @intFromEnum(index);
318+
const odd_hash: i64 = fingerprint.oddHash();
319+
320+
const unbounded = switch (parity(signed_index)) {
321+
.even => signed_index + odd_hash,
322+
.odd => signed_index - odd_hash,
323+
};
324+
const bounded: u32 = @intCast(@mod(unbounded, len));
325+
326+
assert(parity(signed_index) != parity(bounded));
327+
328+
return @enumFromInt(bounded);
320329
}
321330
};
322331

@@ -325,41 +334,46 @@ pub const CuckooFilter = struct {
325334
index_1: BucketIndex,
326335
index_2: BucketIndex,
327336

328-
pub fn initFromTrigram(trigram: Trigram) Triplet {
337+
pub fn initFromTrigram(trigram: Trigram, len: u32) Triplet {
329338
const split: packed struct {
330339
fingerprint: Fingerprint,
331340
padding: u24,
332-
index_1: BucketIndex,
341+
index_1: u32,
333342
} = @bitCast(std.hash.Murmur2_64.hash(&trigram));
334343

344+
const index_1: BucketIndex = @enumFromInt(split.index_1 % len);
345+
335346
const fingerprint: Fingerprint = if (split.fingerprint == .none)
336-
@enumFromInt(0)
347+
@enumFromInt(1)
337348
else
338349
split.fingerprint;
339350

340351
const triplet: Triplet = .{
341352
.fingerprint = fingerprint,
342-
.index_1 = split.index_1,
343-
.index_2 = split.index_1.alternate(fingerprint),
353+
.index_1 = index_1,
354+
.index_2 = index_1.alternate(fingerprint, len),
344355
};
345-
assert(triplet.index_2.alternate(fingerprint) == triplet.index_1);
356+
assert(triplet.index_2.alternate(fingerprint, len) == index_1);
346357

347358
return triplet;
348359
}
349360
};
350361

362+
pub fn init(buckets: []Bucket) CuckooFilter {
363+
assert(parity(buckets.len) == .even);
364+
return .{ .buckets = buckets };
365+
}
366+
351367
pub fn reset(filter: CuckooFilter) void {
352-
@memset(filter.buckets, [1]Fingerprint{.none} ** 4);
368+
@memset(filter.buckets, [1]Fingerprint{.none} ** @typeInfo(Bucket).array.len);
353369
}
354370

355-
pub fn capacityForCount(count: usize) error{Overflow}!usize {
356-
const fill_rate = 0.95;
357-
return try std.math.ceilPowerOfTwo(usize, @intFromFloat(@ceil(@as(f32, @floatFromInt(count)) / fill_rate)));
371+
pub fn capacityForCount(count: u32) error{Overflow}!u32 {
372+
return count + (count & 1);
358373
}
359374

360-
// Use a hash (fnv) for randomness.
361375
pub fn append(filter: CuckooFilter, random: std.Random, trigram: Trigram) error{EvictionFailed}!void {
362-
const triplet: Triplet = .initFromTrigram(trigram);
376+
const triplet: Triplet = .initFromTrigram(trigram, @intCast(filter.buckets.len));
363377

364378
if (filter.appendToBucket(triplet.index_1, triplet.fingerprint) or
365379
filter.appendToBucket(triplet.index_2, triplet.fingerprint))
@@ -371,7 +385,7 @@ pub const CuckooFilter = struct {
371385
var index = if (random.boolean()) triplet.index_1 else triplet.index_2;
372386
for (0..500) |_| {
373387
fingerprint = filter.swapFromBucket(random, index, fingerprint);
374-
index = index.alternate(fingerprint);
388+
index = index.alternate(fingerprint, @intCast(filter.buckets.len));
375389

376390
if (filter.appendToBucket(index, fingerprint)) {
377391
return;
@@ -382,8 +396,7 @@ pub const CuckooFilter = struct {
382396
}
383397

384398
fn bucketAt(filter: CuckooFilter, index: BucketIndex) *Bucket {
385-
assert(std.math.isPowerOfTwo(filter.buckets.len));
386-
return &filter.buckets[@intFromEnum(index) & (filter.buckets.len - 1)];
399+
return &filter.buckets[@intFromEnum(index)];
387400
}
388401

389402
fn appendToBucket(filter: CuckooFilter, index: BucketIndex, fingerprint: Fingerprint) bool {
@@ -408,6 +421,7 @@ pub const CuckooFilter = struct {
408421
) Fingerprint {
409422
assert(fingerprint != .none);
410423

424+
comptime assert(@typeInfo(Bucket).array.len == 4);
411425
const target = &filter.bucketAt(index)[random.int(u2)];
412426

413427
const old_fingerprint = target.*;
@@ -419,7 +433,7 @@ pub const CuckooFilter = struct {
419433
}
420434

421435
pub fn contains(filter: CuckooFilter, trigram: Trigram) bool {
422-
const triplet: Triplet = .initFromTrigram(trigram);
436+
const triplet: Triplet = .initFromTrigram(trigram, @intCast(filter.buckets.len));
423437

424438
return filter.containsInBucket(triplet.index_1, triplet.fingerprint) or
425439
filter.containsInBucket(triplet.index_2, triplet.fingerprint);
@@ -443,16 +457,15 @@ pub const CuckooFilter = struct {
443457
test CuckooFilter {
444458
const allocator = std.testing.allocator;
445459

446-
const element_count = 486;
460+
const element_count = 499;
447461
const filter_size = comptime CuckooFilter.capacityForCount(element_count) catch unreachable;
448-
try std.testing.expectEqual(512, filter_size);
449462

450463
var entries: std.AutoArrayHashMapUnmanaged(Trigram, void) = .empty;
451464
defer entries.deinit(allocator);
452465
try entries.ensureTotalCapacity(allocator, element_count);
453466

454467
var buckets: [filter_size]CuckooFilter.Bucket = undefined;
455-
var filter: CuckooFilter = .{ .buckets = &buckets };
468+
var filter: CuckooFilter = .init(&buckets);
456469
var filter_prng: std.Random.DefaultPrng = .init(42);
457470

458471
for (0..2_500) |gen_prng_seed| {

0 commit comments

Comments
 (0)