
Commit f1bd5b0

feat: whitespace optimizer

1 parent fc7be52

File tree

2 files changed: +321 -0 lines changed

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
#[macro_use]
extern crate criterion;

use criterion::{Criterion, Throughput};
use tokenizers::pre_tokenizers::whitespace::{Whitespace, WhitespaceOptimized};
use tokenizers::{OffsetReferential, OffsetType, PreTokenizer, PreTokenizedString};

fn bench_whitespace_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("whitespace-pre-tokenizers");

    // Test data with various characteristics
    let test_cases = vec![
        ("simple", "Hello world! How are you doing?"),
        ("mixed", "This is a test with numbers 123 and symbols @#$% and unicode: café résumé"),
        ("whitespace_heavy", "Multiple   spaces\tand\nnewlines\r\nhere"),
        ("symbol_heavy", "Hello!@#$%^&*()world?><>{}[]|\\"),
        ("word_heavy", "This is a very long sentence with many words that should be tokenized properly"),
        ("unicode_heavy", "αβγ δέζ ηθι κλμ νξο πρσ τυφ χψω"),
        ("mixed_unicode", "Hello 123 αβγ !@# world δέζ ηθι"),
    ];

    for (name, text) in test_cases {
        let data_len = text.len() as u64;
        group.throughput(Throughput::Bytes(data_len));

        // Benchmark the original regex-based implementation
        group.bench_function(&format!("{}-original", name), |b| {
            b.iter(|| {
                let mut pretokenized = PreTokenizedString::from(text);
                let pretok = Whitespace {};
                pretok.pre_tokenize(&mut pretokenized).unwrap();
                let _result = pretokenized
                    .get_splits(OffsetReferential::Original, OffsetType::Byte)
                    .into_iter()
                    .map(|(s, o, _)| (s, o))
                    .collect::<Vec<_>>();
            })
        });

        // Benchmark the optimized byte-level implementation
        group.bench_function(&format!("{}-optimized", name), |b| {
            b.iter(|| {
                let mut pretokenized = PreTokenizedString::from(text);
                let pretok = WhitespaceOptimized {};
                pretok.pre_tokenize(&mut pretokenized).unwrap();
                let _result = pretokenized
                    .get_splits(OffsetReferential::Original, OffsetType::Byte)
                    .into_iter()
                    .map(|(s, o, _)| (s, o))
                    .collect::<Vec<_>>();
            })
        });
    }

    group.finish();
}

fn bench_large_text(c: &mut Criterion) {
    let mut group = c.benchmark_group("whitespace-large-text");

    // Create a large text by repeating a mixed pattern
    let base_text = "Hello world! This is a test with numbers 123 and symbols @#$% and unicode: café résumé. ";
    let large_text: String = base_text.repeat(1000); // ~90KB of text
    let data_len = large_text.len() as u64;

    group.throughput(Throughput::Bytes(data_len));

    group.bench_function("large-original", |b| {
        b.iter(|| {
            let mut pretokenized = PreTokenizedString::from(large_text.as_str());
            let pretok = Whitespace {};
            pretok.pre_tokenize(&mut pretokenized).unwrap();
            let _result = pretokenized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>();
        })
    });

    group.bench_function("large-optimized", |b| {
        b.iter(|| {
            let mut pretokenized = PreTokenizedString::from(large_text.as_str());
            let pretok = WhitespaceOptimized {};
            pretok.pre_tokenize(&mut pretokenized).unwrap();
            let _result = pretokenized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>();
        })
    });

    group.finish();
}

criterion_group! {
    name = whitespace_benches;
    config = Criterion::default().sample_size(20);
    targets = bench_whitespace_comparison, bench_large_text
}

criterion_main!(whitespace_benches);
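
For reference, a Criterion benchmark file like the one above only runs if it is registered as a bench target with the default test harness disabled. This commit's diff does not show the benchmark's path or its Cargo.toml entry, so the names below are assumptions for illustration; a minimal sketch, assuming the file lives at tokenizers/benches/whitespace_optimized.rs:

    # Cargo.toml -- hypothetical [[bench]] entry; name and path are not from this commit
    [[bench]]
    name = "whitespace_optimized"
    harness = false

With such an entry in place, cargo bench --bench whitespace_optimized would run both groups, and Criterion would report per-case throughput in bytes per second thanks to the Throughput::Bytes calls above.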

tokenizers/src/pre_tokenizers/whitespace.rs

Lines changed: 218 additions & 0 deletions
@@ -28,6 +28,141 @@ impl PreTokenizer for Whitespace {
    }
}

/// Optimized whitespace pre-tokenizer that uses byte-level scanning instead of regex.
/// This provides better performance but may have slightly different behavior in edge cases
/// compared to the regex-based implementation.
#[derive(Clone, Debug, PartialEq, Eq)]
#[macro_rules_attribute(impl_serde_type!)]
pub struct WhitespaceOptimized;

impl Default for WhitespaceOptimized {
    fn default() -> Self {
        Self
    }
}

impl PreTokenizer for WhitespaceOptimized {
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
        pretokenized.split(|_, normalized| {
            normalized.split(Invert(WhitespacePattern), SplitDelimiterBehavior::Removed)
        })
    }
}

/// Custom pattern implementation for optimized whitespace splitting.
/// This implements the equivalent of the regex r"\w+|[^\w\s]+" with manual byte scanning.
struct WhitespacePattern;

impl crate::tokenizer::pattern::Pattern for WhitespacePattern {
    fn find_matches(&self, inside: &str) -> Result<Vec<(crate::Offsets, bool)>> {
        if inside.is_empty() {
            return Ok(vec![((0, 0), false)]);
        }

        let mut matches = Vec::new();
        let mut current_start = 0;
        let mut current_end = 0;
        // None = in whitespace, Some(true) = in a word run, Some(false) = in a symbol run
        let mut current_type = None;

        let mut i = 0;
        while i < inside.len() {
            let char_start = inside[i..].chars().next().unwrap();
            let char_len = char_start.len_utf8();

            let is_whitespace = char_start.is_whitespace();
            let is_word_char = char_start.is_alphanumeric() || char_start == '_';
            let is_symbol = !is_whitespace && !is_word_char;

            match (current_type, is_whitespace, is_word_char, is_symbol) {
                (None, true, _, _) => {
                    // Continue in whitespace
                    i += char_len;
                }
                (None, false, true, _) => {
                    // Transition from whitespace to word
                    current_start = i;
                    current_end = i + char_len;
                    current_type = Some(true);
                    i += char_len;
                }
                (None, false, false, true) => {
                    // Transition from whitespace to symbol
                    current_start = i;
                    current_end = i + char_len;
                    current_type = Some(false);
                    i += char_len;
                }
                (None, false, false, false) => {
                    // Unreachable in practice: every char is whitespace, word, or symbol.
                    // Handle it gracefully by treating it as a symbol.
                    current_start = i;
                    current_end = i + char_len;
                    current_type = Some(false);
                    i += char_len;
                }
                (Some(true), true, _, _) => {
                    // Transition from word to whitespace - finish the word
                    matches.push(((current_start, current_end), true));
                    current_type = None;
                    i += char_len;
                }
                (Some(true), false, true, _) => {
                    // Continue in word
                    current_end = i + char_len;
                    i += char_len;
                }
                (Some(true), false, false, true) => {
                    // Transition from word to symbol - finish the word, start a symbol
                    matches.push(((current_start, current_end), true));
                    current_start = i;
                    current_end = i + char_len;
                    current_type = Some(false);
                    i += char_len;
                }
                (Some(true), false, false, false) => {
                    // Unreachable in practice; handle as symbol
                    matches.push(((current_start, current_end), true));
                    current_start = i;
                    current_end = i + char_len;
                    current_type = Some(false);
                    i += char_len;
                }
                (Some(false), true, _, _) => {
                    // Transition from symbol to whitespace - finish the symbol
                    matches.push(((current_start, current_end), true));
                    current_type = None;
                    i += char_len;
                }
                (Some(false), false, true, _) => {
                    // Transition from symbol to word - finish the symbol, start a word
                    matches.push(((current_start, current_end), true));
                    current_start = i;
                    current_end = i + char_len;
                    current_type = Some(true);
                    i += char_len;
                }
                (Some(false), false, false, true) => {
                    // Continue in symbol
                    current_end = i + char_len;
                    i += char_len;
                }
                (Some(false), false, false, false) => {
                    // Unreachable in practice; continue the symbol run
                    current_end = i + char_len;
                    i += char_len;
                }
            }
        }

        // Don't forget the last token
        if current_type.is_some() {
            matches.push(((current_start, current_end), true));
        }

        Ok(matches)
    }
}

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
#[macro_rules_attribute(impl_serde_type!)]
pub struct WhitespaceSplit;
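
To make the state machine above concrete, here is a short hand trace (illustrative, not part of the diff) of WhitespacePattern::find_matches on the input "ab! c":

    // 'a','b' -> word run,    emits ((0, 2), true)
    // '!'     -> symbol run,  emits ((2, 3), true)
    // ' '     -> whitespace,  produces no entry (a gap between matches)
    // 'c'     -> word run,    emits ((4, 5), true)
    // => Ok(vec![((0, 2), true), ((2, 3), true), ((4, 5), true)])

Wrapped in Invert(..) with SplitDelimiterBehavior::Removed, those runs become the kept pieces, so the pre-tokenizer yields "ab", "!", "c" -- the same segmentation the regex r"\w+|[^\w\s]+" produces here.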
@@ -102,4 +237,87 @@ mod tests {
            );
        }
    }

    #[test]
    fn optimized_compatibility() {
        // Test that the optimized version produces the same results as the original
        let test_cases = vec![
            "Hello world!",
            "How are you doing?",
            "This is a test with numbers 123 and symbols @#$%",
            "Multiple   spaces",
            "Tabs\tand\nnewlines",
            "Unicode: café résumé naïve",
            "Mixed: Hello123!@# world",
            "Edge cases: a.b,c;d:e",
            "Empty string:",
            "Only spaces: ",
            "Only symbols: !@#$%",
            "Only words: hello world",
            "Numbers: 123 456 789",
            "Underscores: hello_world test_case",
            "Special chars: αβγ δέζ ηθι",
        ];

        for test_case in test_cases {
            let mut original = PreTokenizedString::from(test_case);
            let mut optimized = PreTokenizedString::from(test_case);

            let original_pretok = Whitespace {};
            let optimized_pretok = WhitespaceOptimized {};

            original_pretok.pre_tokenize(&mut original).unwrap();
            optimized_pretok.pre_tokenize(&mut optimized).unwrap();

            let original_splits = original
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>();

            let optimized_splits = optimized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>();

            assert_eq!(
                original_splits, optimized_splits,
                "Mismatch for test case: '{}'",
                test_case
            );
        }
    }

    #[test]
    fn optimized_edge_cases() {
        let pretok = WhitespaceOptimized {};

        // Test various edge cases
        let edge_cases = vec![
            ("", vec![]),
            (" ", vec![]),
            ("   ", vec![]),
            ("a", vec![("a", (0, 1))]),
            ("!", vec![("!", (0, 1))]),
            ("a!", vec![("a", (0, 1)), ("!", (1, 2))]),
            ("!a", vec![("!", (0, 1)), ("a", (1, 2))]),
            ("a b", vec![("a", (0, 1)), ("b", (2, 3))]),
            ("a  b", vec![("a", (0, 1)), ("b", (3, 4))]),
            ("a\tb", vec![("a", (0, 1)), ("b", (2, 3))]),
            ("a\nb", vec![("a", (0, 1)), ("b", (2, 3))]),
            ("a\r\nb", vec![("a", (0, 1)), ("b", (3, 4))]),
        ];

        for (input, expected) in edge_cases {
            let mut pretokenized = PreTokenizedString::from(input);
            pretok.pre_tokenize(&mut pretokenized).unwrap();
            let result = pretokenized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>();
            assert_eq!(result, expected, "Failed for input: '{}'", input);
        }
    }
}
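
The doc comment's hedge about "slightly different behavior in edge cases" is worth probing: Unicode \w (per UTS#18: Alphabetic, Mark, Decimal_Number, Connector_Punctuation, Join_Control) is not identical to char::is_alphanumeric() || c == '_'. Below is a minimal standalone probe of one such divergence, written against the regex crate as an assumption for illustration (the engine backing the original Whitespace pre-tokenizer is not shown in this diff):

    use regex::Regex;

    fn main() {
        // U+0301 COMBINING ACUTE ACCENT has general category Mn (a mark):
        // Unicode \w matches it, but char::is_alphanumeric() does not.
        let word = Regex::new(r"\w").unwrap();
        let mark = '\u{0301}';
        println!("regex \\w matches U+0301: {}", word.is_match("\u{0301}")); // true
        println!("is_alphanumeric(U+0301): {}", mark.is_alphanumeric()); // false
    }

Inputs containing such characters (for example decomposed accents, as in an NFD-normalized "café") are exactly where the two implementations could split differently, which is what the optimized_compatibility test above guards against for its listed cases.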
