Skip to content

Commit c130207

Browse files
committed
Multibyte bugfixes
Incorrect boundary conditions now corrected.
1 parent 1ad1d12 commit c130207

File tree

3 files changed

+23
-9
lines changed

3 files changed

+23
-9
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ The provided Regex type allows 64 'operations' and 8 unique ASCII character sets
1111
Drop the file into your project, or use the Zig build system:
1212

1313
```zig
14-
zig fetch --save "https://github.com/mnemnion/mvzr/archive/refs/tags/v0.3.0.tar.gz"
14+
zig fetch --save "https://github.com/mnemnion/mvzr/archive/refs/tags/v0.3.1.tar.gz"
1515
```
1616

1717
I'll do my best to keep that URL fresh, but it pays to check over here: ➔

build.zig.zon

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
// This is a [Semantic Version](https://semver.org/).
99
// In a future version of Zig it will be used for package deduplication.
10-
.version = "0.3.0",
10+
.version = "0.3.1",
1111

1212
// This field is optional.
1313
// This is currently advisory only; Zig does not yet do anything

src/mvzr.zig

+21-7
Original file line numberDiff line numberDiff line change
@@ -1195,18 +1195,17 @@ fn prefixModifier(patt: []RegOp, j: usize, op: RegOp) !usize {
11951195
// Try to detect multi-byte characters
11961196
switch (patt[find_j]) {
11971197
.char => |c| {
1198-
if (0x80 <= c and c <= 0x9f) {
1199-
// Group a multi-byte.
1198+
if (0x80 <= c and c <= 0xbf) {
1199+
// Go back to lead byte:
12001200
while (find_j > 0 and
12011201
patt[find_j] == .char and
12021202
0x80 <= patt[find_j].char and
1203-
patt[find_j].char <= 0x9f) : (find_j -= 1)
1204-
{} // Move forward by two
1205-
if (find_j > 0) find_j -= 1;
1203+
patt[find_j].char <= 0xbf) : (find_j -= 1)
1204+
{}
12061205
std.mem.copyBackwards(RegOp, patt[find_j + 2 ..], patt[find_j .. j + 1]);
12071206
patt[find_j] = op;
12081207
patt[find_j + 1] = .left;
1209-
patt[j + 1] = .right;
1208+
patt[j + 2] = .right;
12101209
return 2;
12111210
}
12121211
},
@@ -1461,6 +1460,7 @@ fn compileRegex(RegexT: type, in: []const u8) ?RegexT {
14611460
}
14621461
const d1, const c1 = parseByte(in[i..]) catch {
14631462
// This is fine, literal `}`
1463+
// TODO: is it?
14641464
patt[j] = RegOp{ .char = '}' };
14651465
continue :dispatch;
14661466
};
@@ -2020,7 +2020,17 @@ fn printPatternInternal(patt: []const RegOp) ?u8 {
20202020
switch (patt[j]) {
20212021
.char,
20222022
=> |op| {
2023-
std.debug.print("{s} {u}", .{ @tagName(patt[j]), op });
2023+
switch (op) {
2024+
0...0x3f => {
2025+
std.debug.print("char 0x{x:0>2}", .{op});
2026+
},
2027+
0x40...0x7e => {
2028+
std.debug.print("char '{u}'", .{op});
2029+
},
2030+
0x7f...0xff => {
2031+
std.debug.print("char 0x{x:0>2}", .{op});
2032+
},
2033+
}
20242034
},
20252035
.some,
20262036
.up_to,
@@ -2411,3 +2421,7 @@ test "Multibyte continues" {
24112421
test "Uppercase Greek" {
24122422
try testMatchAll("(\\xce[\\x91-\\xa9])+", "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ");
24132423
}
2424+
2425+
test "M of N multibyte" {
2426+
try testMatchEnd("abλ{3,5}", "abλλλλ");
2427+
}

0 commit comments

Comments
 (0)