@@ -28,6 +28,141 @@ impl PreTokenizer for Whitespace {
     }
 }
 
+/// Optimized whitespace pre-tokenizer that uses byte-level scanning instead of a regex.
+/// This improves performance, but may behave slightly differently from the regex-based
+/// implementation in edge cases.
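+///
+/// A minimal usage sketch (illustrative only; assumes this type is exported the same
+/// way as the existing `Whitespace` pre-tokenizer):
+///
+/// ```ignore
+/// let mut pretokenized = PreTokenizedString::from("Hello world!");
+/// WhitespaceOptimized.pre_tokenize(&mut pretokenized).unwrap();
+/// // Splits: [("Hello", (0, 5)), ("world", (6, 11)), ("!", (11, 12))]
+/// ```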
+#[derive(Clone, Debug, PartialEq, Eq)]
+#[macro_rules_attribute(impl_serde_type!)]
+pub struct WhitespaceOptimized;
+
+impl Default for WhitespaceOptimized {
+    fn default() -> Self {
+        Self
+    }
+}
+
+impl PreTokenizer for WhitespaceOptimized {
+    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
+        pretokenized.split(|_, normalized| {
+            normalized.split(Invert(WhitespacePattern), SplitDelimiterBehavior::Removed)
+        })
+    }
+}
+
+/// Custom `Pattern` implementation for optimized whitespace splitting. This is the
+/// equivalent of the regex `r"\w+|[^\w\s]+"`, implemented with a manual scan instead
+/// of a regex engine.
+struct WhitespacePattern;
+
+impl crate::tokenizer::pattern::Pattern for WhitespacePattern {
+    fn find_matches(&self, inside: &str) -> Result<Vec<(crate::Offsets, bool)>> {
+        if inside.is_empty() {
+            // Mirror the built-in `Pattern` impls: empty input yields a single
+            // empty non-match.
+            return Ok(vec![((0, 0), false)]);
+        }
+
+        let mut matches = Vec::new();
+        let mut current_start = 0;
+        let mut current_end = 0;
+        // None = in whitespace, Some(true) = in a word run, Some(false) = in a symbol run
+        let mut current_type: Option<bool> = None;
+
+        for (i, c) in inside.char_indices() {
+            let char_len = c.len_utf8();
+            let is_whitespace = c.is_whitespace();
+            // Approximates the regex `\w`; anything that is neither whitespace nor a
+            // word character counts as a symbol.
+            let is_word_char = c.is_alphanumeric() || c == '_';
+
+            match (current_type, is_whitespace, is_word_char) {
+                // Still in whitespace: nothing to record
+                (None, true, _) => {}
+                // Whitespace ends: start a word or symbol run
+                (None, false, is_word) => {
+                    current_start = i;
+                    current_end = i + char_len;
+                    current_type = Some(is_word);
+                }
+                // Run ends at whitespace: flush it
+                (Some(_), true, _) => {
+                    matches.push(((current_start, current_end), true));
+                    current_type = None;
+                }
+                // Same kind of run continues
+                (Some(in_word), false, is_word) if in_word == is_word => {
+                    current_end = i + char_len;
+                }
+                // Run switches between word and symbol: flush and start a new run
+                (Some(_), false, is_word) => {
+                    matches.push(((current_start, current_end), true));
+                    current_start = i;
+                    current_end = i + char_len;
+                    current_type = Some(is_word);
+                }
+            }
+        }
+
+        // Don't forget the last run
+        if current_type.is_some() {
+            matches.push(((current_start, current_end), true));
+        }
+
+        // Whitespace gaps are never reported; under `Invert(..)` with
+        // `SplitDelimiterBehavior::Removed` they would be dropped anyway.
+        Ok(matches)
+    }
+}
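+
+// Illustrative trace (a sketch, complementing the tests below): for the input
+// "Hey friend!" the scan above yields
+//     [((0, 3), true), ((4, 10), true), ((10, 11), true)]
+// i.e. the word runs "Hey" and "friend" and the symbol run "!"; the whitespace gap
+// at byte 3 is simply omitted.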
+
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct WhitespaceSplit;
@@ -102,4 +237,87 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn optimized_compatibility() {
+        // The optimized version should produce the same splits as the regex-based one
+        let test_cases = vec![
+            "Hello world!",
+            "How are you doing?",
+            "This is a test with numbers 123 and symbols @#$%",
+            "Multiple   spaces",
+            "Tabs\tand\nnewlines",
+            "Unicode: café résumé naïve",
+            "Mixed: Hello123!@# world",
+            "Edge cases: a.b,c;d:e",
+            "Empty string:",
+            "Only spaces:   ",
+            "Only symbols: !@#$%",
+            "Only words: hello world",
+            "Numbers: 123 456 789",
+            "Underscores: hello_world test_case",
+            "Special chars: αβγ δέζ ηθι",
+        ];
+
+        for test_case in test_cases {
+            let mut original = PreTokenizedString::from(test_case);
+            let mut optimized = PreTokenizedString::from(test_case);
+
+            let original_pretok = Whitespace;
+            let optimized_pretok = WhitespaceOptimized;
+
+            original_pretok.pre_tokenize(&mut original).unwrap();
+            optimized_pretok.pre_tokenize(&mut optimized).unwrap();
+
+            let original_splits = original
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+
+            let optimized_splits = optimized
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+
+            assert_eq!(
+                original_splits, optimized_splits,
+                "Mismatch for test case: '{}'",
+                test_case
+            );
+        }
+    }
+
+    #[test]
+    fn optimized_edge_cases() {
+        let pretok = WhitespaceOptimized;
+
+        // Various edge cases, including repeated whitespace and escape characters
+        let edge_cases = vec![
+            ("", vec![]),
+            (" ", vec![]),
+            ("   ", vec![]),
+            ("a", vec![("a", (0, 1))]),
+            ("!", vec![("!", (0, 1))]),
+            ("a!", vec![("a", (0, 1)), ("!", (1, 2))]),
+            ("!a", vec![("!", (0, 1)), ("a", (1, 2))]),
+            ("a b", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a  b", vec![("a", (0, 1)), ("b", (3, 4))]),
+            ("a\tb", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a\nb", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a\r\nb", vec![("a", (0, 1)), ("b", (3, 4))]),
+        ];
+
+        for (input, expected) in edge_cases {
+            let mut pretokenized = PreTokenizedString::from(input);
+            pretok.pre_tokenize(&mut pretokenized).unwrap();
+            let result = pretokenized
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+            assert_eq!(result, expected, "Failed for input: '{}'", input);
+        }
+    }
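+
+    // A hedged sketch of a direct check on `WhitespacePattern::find_matches` (test name
+    // and scenario are illustrative additions): it exercises multi-byte UTF-8 offsets,
+    // the risky part of byte-level scanning.
+    #[test]
+    fn optimized_pattern_utf8_offsets() {
+        use crate::tokenizer::pattern::Pattern;
+
+        // "café" is 5 bytes ("é" is 2), "☕" is 3 bytes, so the symbol run "☕!"
+        // spans bytes 6..10; the whitespace gap at byte 5 is omitted.
+        let matches = WhitespacePattern.find_matches("café ☕!").unwrap();
+        assert_eq!(matches, vec![((0, 5), true), ((6, 10), true)]);
+    }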
 }