@@ -30,6 +30,131 @@ filter_buckets: std.ArrayListUnmanaged(CuckooFilter.Bucket),
3030trigram_to_declarations : std .AutoArrayHashMapUnmanaged (Trigram , std .ArrayListUnmanaged (Declaration .Index )),
3131declarations : std .MultiArrayList (Declaration ),
3232
/// Iterates over the lowercased trigrams of an identifier.
///
/// The buffer is cut into segments by `Boundary.calculate`; a three byte
/// window then slides over each segment independently, so no trigram ever
/// spans two segments. A segment shorter than three bytes produces a single
/// trigram that is right-padded with zero bytes.
pub const TrigramIterator = struct {
    /// The text being iterated. Must not be empty.
    buffer: []const u8,
    /// Start of the next trigram; equals `buffer.len` once exhausted.
    index: usize,
    /// Extent of the segment that `index` currently lies in.
    boundary: Boundary,

    /// Describes where the current segment stops and where the next one starts.
    pub const Boundary = struct {
        /// Exclusive end offset of the segment.
        end: usize,
        /// Offset at which the following segment begins, or `null` when the
        /// segment runs to the end of the buffer.
        next_start: ?usize,

        /// Computes the segment of `buffer` that begins at `index`.
        ///
        /// The shape of the segment depends on its leading bytes, classified
        /// with `std.ascii.isLower` / `std.ascii.isUpper`:
        /// - first byte lowercase: the segment is the run of lowercase bytes.
        /// - first byte not lowercase, second byte lowercase: the first byte
        ///   followed by the run of lowercase bytes.
        /// - first two bytes not lowercase: those two bytes followed by the
        ///   run of uppercase bytes.
        ///
        /// Asserts that at least one byte is available at `index`.
        pub fn calculate(buffer: []const u8, index: usize) Boundary {
            const rest = buffer[index..];
            assert(rest.len > 0);

            const to_buffer_end: Boundary = .{ .end = buffer.len, .next_start = null };
            const starts_lower = std.ascii.isLower(rest[0]);

            // A lone trailing non-lowercase byte is a segment of its own.
            if (!starts_lower and rest.len == 1) return to_buffer_end;

            // Only a non-lowercase pair at the front switches the scan over
            // to looking for the end of an uppercase run.
            const scan_upper = !starts_lower and !std.ascii.isLower(rest[1]);

            // Bytes that were already classified above are not re-examined.
            var cursor: usize = if (starts_lower) index + 1 else index + 2;
            while (cursor < buffer.len) : (cursor += 1) {
                const byte = buffer[cursor];
                const continues_run = if (scan_upper)
                    std.ascii.isUpper(byte)
                else
                    std.ascii.isLower(byte);

                if (!continues_run) return .{ .end = cursor, .next_start = cursor };
            }

            return to_buffer_end;
        }
    };

    /// Creates an iterator positioned at the start of `buffer`.
    /// Asserts that `buffer` is not empty.
    pub fn init(buffer: []const u8) TrigramIterator {
        assert(buffer.len != 0);
        const first_segment: Boundary = .calculate(buffer, 0);
        return .{
            .buffer = buffer,
            .index = 0,
            .boundary = first_segment,
        };
    }

    /// Returns the next lowercased trigram, or `null` once the whole buffer
    /// has been consumed.
    pub fn next(ti: *TrigramIterator) ?Trigram {
        const start = ti.index;
        if (start == ti.buffer.len) return null;

        const segment_end = ti.boundary.end;
        assert(start < segment_end);

        // Clamp the window to the segment; unused trailing bytes stay zero.
        const window = ti.buffer[start..@min(start + 3, segment_end)];
        var result: [3]u8 = @splat(0);
        _ = std.ascii.lowerString(&result, window);

        // Keep sliding only while a further full window fits in the segment.
        const segment_has_more = window.len >= 3 and start + 3 < segment_end;
        if (segment_has_more) {
            ti.index = start + 1;
        } else if (ti.boundary.next_start) |following| {
            ti.index = following;
            ti.boundary = .calculate(ti.buffer, following);
        } else {
            // Last segment finished: mark the iterator as exhausted.
            ti.index = ti.buffer.len;
        }

        return result;
    }
};
119+
test "TrigramIterator.Boundary.calculate" {
    const input = "helloWORLD";

    // Starting on a lowercase letter: the segment stops at the first
    // non-lowercase byte, which is also where the next segment begins.
    const lower_run = TrigramIterator.Boundary.calculate(input, 0);
    try std.testing.expectEqual(5, lower_run.end);
    try std.testing.expectEqual(5, lower_run.next_start.?);

    // Starting on the uppercase run: it extends to the end of the buffer,
    // so there is no following segment.
    const upper_run = TrigramIterator.Boundary.calculate(input, 5);
    try std.testing.expectEqual(10, upper_run.end);
    try std.testing.expectEqual(null, upper_run.next_start);
}
129+
test TrigramIterator {
    const allocator = std.testing.allocator;

    // Each entry pairs an input identifier with the trigrams the iterator is
    // expected to yield, in order. Inputs shorter than three bytes yield one
    // trigram padded with zero bytes; every literal must therefore be exactly
    // three bytes long to be a valid `Trigram`.
    const matrix: []const struct { []const u8, []const Trigram } = &.{
        .{ "a", &.{"a\x00\x00".*} },
        .{ "ab", &.{"ab\x00".*} },
        .{ "helloWORLD", &.{ "hel".*, "ell".*, "llo".*, "wor".*, "orl".*, "rld".* } },
        .{ "HelloWORLD", &.{ "hel".*, "ell".*, "llo".*, "wor".*, "orl".*, "rld".* } },
        .{ "HelloWorld", &.{ "hel".*, "ell".*, "llo".*, "wor".*, "orl".*, "rld".* } },
    };

    // Reused across cases so only the first few appends allocate.
    var actual: std.ArrayList(Trigram) = .empty;
    defer actual.deinit(allocator);

    for (matrix) |entry| {
        const input, const expected = entry;

        actual.clearRetainingCapacity();

        var it: TrigramIterator = .init(input);
        while (it.next()) |trigram| {
            try actual.append(allocator, trigram);
        }

        try @import("testing.zig").expectEqual(expected, actual.items);
    }
}
157+
33158pub fn init (
34159 allocator : std.mem.Allocator ,
35160 tree : * const Ast ,
0 commit comments