Skip to content

Commit 0357144

Browse files
Case-sensitive trigram iterator
Co-Authored-By: Techatrix <[email protected]>
1 parent 7dc9895 commit 0357144

File tree

1 file changed

+125
-0
lines changed

1 file changed

+125
-0
lines changed

src/TrigramStore.zig

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,131 @@ filter_buckets: std.ArrayListUnmanaged(CuckooFilter.Bucket),
3030
trigram_to_declarations: std.AutoArrayHashMapUnmanaged(Trigram, std.ArrayListUnmanaged(Declaration.Index)),
3131
declarations: std.MultiArrayList(Declaration),
3232

33+
/// Iterates over the trigrams of a non-empty byte string.
///
/// The input is cut into "words" at letter-case boundaries (see
/// `Boundary.calculate`). A three-byte window slides over each word
/// separately, so no trigram spans two words. Every trigram is ASCII
/// lowercased; a word shorter than three bytes yields a single trigram padded
/// with trailing zero bytes.
pub const TrigramIterator = struct {
    /// The text being iterated; not owned by the iterator.
    buffer: []const u8,
    /// Start of the next window, or `buffer.len` once iteration is finished.
    index: usize,
    /// Extent of the word that `index` currently lies in.
    boundary: Boundary,

    /// Creates an iterator positioned at the first word of `buffer`.
    /// Asserts that `buffer` is not empty.
    pub fn init(buffer: []const u8) TrigramIterator {
        assert(buffer.len != 0);
        const first_word: Boundary = .calculate(buffer, 0);
        return .{ .buffer = buffer, .index = 0, .boundary = first_word };
    }

    /// Describes where the current word stops and where the next one begins.
    pub const Boundary = struct {
        /// Exclusive end offset of the current word.
        end: usize,
        /// Offset of the following word, or `null` if the current word runs
        /// to the end of the buffer.
        next_start: ?usize,

        /// Finds the extent of the word that starts at `buffer[index]`.
        /// Asserts that `index` is inside `buffer`.
        ///
        /// A word takes one of three shapes:
        ///   * a run of lowercase bytes,
        ///   * one non-lowercase byte followed by a run of lowercase bytes,
        ///   * two non-lowercase bytes followed by a run of uppercase bytes.
        pub fn calculate(buffer: []const u8, index: usize) Boundary {
            const rest = buffer[index..];
            assert(rest.len > 0);

            const to_end: Boundary = .{ .end = buffer.len, .next_start = null };

            // Which character class keeps the current word going.
            const Run = enum { lower, upper };

            var run: Run = .lower;
            var scan_from: usize = index + 1;

            if (!std.ascii.isLower(rest[0])) {
                // A lone trailing non-lowercase byte is a word on its own.
                if (rest.len < 2) return to_end;

                // The second byte decides which kind of run follows.
                if (!std.ascii.isLower(rest[1])) run = .upper;
                scan_from = index + 2;
            }

            var pos = scan_from;
            while (pos < buffer.len) : (pos += 1) {
                const byte = buffer[pos];
                const in_run = switch (run) {
                    .lower => std.ascii.isLower(byte),
                    .upper => std.ascii.isUpper(byte),
                };
                // The first byte that leaves the run starts the next word.
                if (!in_run) return .{ .end = pos, .next_start = pos };
            }

            return to_end;
        }
    };

    /// Returns the next lowercased trigram, or `null` once the whole buffer
    /// has been consumed.
    pub fn next(ti: *TrigramIterator) ?Trigram {
        if (ti.index == ti.buffer.len) return null;
        assert(ti.index < ti.boundary.end);

        const start = ti.index;
        const word_end = ti.boundary.end;

        // Clamp the window so that it never reaches into the next word.
        const window = ti.buffer[start..@min(start + 3, word_end)];

        // Zero-filled first: a short window leaves the tail as padding.
        var result: [3]u8 = @splat(0);
        _ = std.ascii.lowerString(&result, window);

        const word_has_more = window.len == 3 and start + 3 < word_end;
        if (word_has_more) {
            ti.index = start + 1;
        } else if (ti.boundary.next_start) |next_word| {
            // This was the last window of the word; jump to the next word.
            ti.index = next_word;
            ti.boundary = .calculate(ti.buffer, next_word);
        } else {
            // No further word: mark the iterator as exhausted.
            ti.index = ti.buffer.len;
        }

        return result;
    }
};
119+
120+
test "TrigramIterator.Boundary.calculate" {
    const input = "helloWORLD";

    // The lowercase run "hello" stops at the first uppercase byte.
    const lower_word: TrigramIterator.Boundary = .calculate(input, 0);
    try std.testing.expectEqual(5, lower_word.end);
    try std.testing.expectEqual(5, lower_word.next_start.?);

    // The uppercase run "WORLD" extends to the end of the input.
    const upper_word: TrigramIterator.Boundary = .calculate(input, 5);
    try std.testing.expectEqual(10, upper_word.end);
    try std.testing.expectEqual(null, upper_word.next_start);
}
129+
130+
test TrigramIterator {
    // Each case pairs an input string with the trigrams it must produce, in order.
    const Case = struct { []const u8, []const Trigram };
    const cases: []const Case = &.{
        // Inputs shorter than three bytes are padded with zero bytes.
        .{ "a", &.{"a\x00\x00".*} },
        .{ "ab", &.{"ab\x00".*} },
        // Different casings of the same two words yield identical trigrams.
        .{ "helloWORLD", &.{ "hel".*, "ell".*, "llo".*, "wor".*, "orl".*, "rld".* } },
        .{ "HelloWORLD", &.{ "hel".*, "ell".*, "llo".*, "wor".*, "orl".*, "rld".* } },
        .{ "HelloWorld", &.{ "hel".*, "ell".*, "llo".*, "wor".*, "orl".*, "rld".* } },
    };

    const gpa = std.testing.allocator;

    // One list is reused across all cases to avoid reallocating.
    var produced: std.ArrayList(Trigram) = .empty;
    defer produced.deinit(gpa);

    for (cases) |case| {
        const text, const want = case;

        produced.clearRetainingCapacity();

        var iter: TrigramIterator = .init(text);
        while (iter.next()) |found| {
            try produced.append(gpa, found);
        }

        try @import("testing.zig").expectEqual(want, produced.items);
    }
}
157+
33158
pub fn init(
34159
allocator: std.mem.Allocator,
35160
tree: *const Ast,

0 commit comments

Comments
 (0)