@@ -16,12 +16,10 @@ static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
-    // Note: Rewrite the negative look-ahead with a positive pseudo look-ahead.
-    // The look-ahead character is dropped from the match by the SpecialRegexp iterator.
-    // Note: The negative look-ahead requires also the pattern `\\s+$` to handle end of file without dropping a character!
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
-    Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
+    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
+        .expect("valid regex")
 });

 static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
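The deleted comments describe the trick that `new_lookahead` now makes explicit: `\s+\s` only matches a whitespace run that is followed by one more whitespace character, and the split iterator drops that final character from the yielded match again. This is equivalent to the negative look-ahead `\s+(?!\S)` of the original OpenAI pattern everywhere except at end of input, which the `\s+$` alternative in `pat1` covers. A minimal sketch of the resulting behavior, assuming the statics above (the trace is illustrative, not part of this commit):

```rust
// With the cl100k patterns, splitting "a   b" proceeds as:
//   pat1 matches "a";
//   pat2 (`\s+\s`) matches "   ", and the trailing look-ahead space is
//   dropped, yielding "  " and leaving " b" unconsumed;
//   pat1 matches " b" via its `[^\r\n\p{L}\p{N}]?\p{L}+` alternative.
let parts: Vec<&str> = BPE_CL100K_BASE.split("a   b").collect();
assert_eq!(parts, vec!["a", "  ", " b"]);
```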
@@ -37,7 +35,8 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     ].join("|");
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
-    Tokenizer::with_many(bpe, &[pat1.as_str(), pat2, pat3]).expect("valid regex")
+    Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
+        .expect("valid regex")
 });

 pub use bpe::*;
@@ -52,22 +51,33 @@ pub struct Tokenizer {
     /// The byte-pair encoding for this tokenizer.
     pub bpe: BytePairEncoding,
     /// The pattern regex used to split the input.
-    pub pat: Option<Regex>,
+    pub pre: Option<Pretokenizer>,
+}
+
+pub struct Pretokenizer {
+    /// The pattern regex used to split the input.
+    pat: Regex,
+    /// For each pattern in the regex a boolean whether the last character is a look-ahead.
+    lookahead: Vec<bool>,
 }

 impl Tokenizer {
     /// Build a tokenizer with an optional pretokenization regex pattern.
     #[allow(clippy::result_large_err)]
     pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
-        let pat = pat.map(Regex::new).transpose()?;
-        Ok(Self { bpe, pat })
+        let pre = pat.map(Pretokenizer::new).transpose()?;
+        Ok(Self { bpe, pre })
     }

-    /// When using multiple patterns, the second pattern is assumed to be a look-ahead pattern with
-    /// exactly one look-ahead character!
-    pub fn with_many(bpe: BytePairEncoding, patterns: &[&str]) -> Result<Self, BuildError> {
-        let pat = Some(Regex::new_many(patterns)?);
-        Ok(Self { bpe, pat })
+    /// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
+    /// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
+    #[allow(clippy::result_large_err)]
+    pub fn new_lookahead(
+        bpe: BytePairEncoding,
+        patterns: &[(&str, bool)],
+    ) -> Result<Self, BuildError> {
+        let pre = Some(Pretokenizer::new_lookahead(patterns)?);
+        Ok(Self { bpe, pre })
     }

     pub fn count(&self, text: &str) -> usize {
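A minimal usage sketch of the new constructor; `bpe` stands in for any loaded `BytePairEncoding`, and the patterns are illustrative, not the ones shipped above:

```rust
// The bool tells the pretokenizer whether a pattern's final character is a
// look-ahead that must be dropped again from the yielded match.
let tok = Tokenizer::new_lookahead(
    bpe,
    &[
        (" ?\\p{L}+", false), // words, optionally with one leading space
        ("\\s+\\s", true),    // whitespace followed by whitespace: pseudo look-ahead
        ("\\s+", false),      // remaining whitespace, e.g. at the end of the text
    ],
)?;
```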
@@ -86,15 +96,41 @@ impl Tokenizer {
         String::from_utf8(self.bpe.decode_tokens(tokens)).ok()
     }

-    pub fn split<'a>(&'a self, input: &'a str) -> impl Iterator<Item = &str> + 'a {
-        match &self.pat {
-            Some(pat) => Either::Left(SpecialRegexp {
-                pat,
-                input,
-                last: 0,
-                caps: Captures::matches(pat.group_info().clone()),
-            }),
-            None => Either::Right(std::iter::once(input)),
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
+        match &self.pre {
+            Some(pre) => Either::Left(pre.split(text)),
+            None => Either::Right(std::iter::once(text)),
+        }
+    }
+}
+
+impl Pretokenizer {
+    /// Build a pretokenizer from the given regex pattern.
+    #[allow(clippy::result_large_err)]
+    fn new(pat: &str) -> Result<Self, BuildError> {
+        let pat = Regex::new(pat)?;
+        Ok(Self {
+            pat,
+            lookahead: vec![false],
+        })
+    }
+
+    /// Build a pretokenizer from the given regex patterns. If the boolean for a pattern is true,
+    /// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
+    #[allow(clippy::result_large_err)]
+    fn new_lookahead(pats: &[(&str, bool)]) -> Result<Self, BuildError> {
+        let (pats, lookahead): (Vec<_>, _) = pats.iter().copied().unzip();
+        let pat = Regex::new_many(&pats)?;
+        Ok(Self { pat, lookahead })
+    }
+
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
+        Splits {
+            pat: &self.pat,
+            lookahead: &self.lookahead,
+            text,
+            last: 0,
+            caps: Captures::matches(self.pat.group_info().clone()),
         }
     }
 }
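Note that with `Regex::new_many`, earlier patterns take priority when several match at the same position, which is why a look-ahead pattern such as `\s+\s` should be listed before its plain `\s+` fallback. A hypothetical direct use of `Pretokenizer` from within this module (illustrative patterns, same assumptions as above):

```rust
let pre = Pretokenizer::new_lookahead(&[
    (" ?\\p{L}+", false),
    ("\\s+\\s", true),
    ("\\s+", false),
])?;
// Yields ["hello", "  ", " world"]: the look-ahead space is not consumed,
// so it is still available to the following ` ?\p{L}+` match.
let pieces: Vec<&str> = pre.split("hello   world").collect();
```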
@@ -107,33 +143,34 @@ impl Tokenizer {
 ///
 /// Alternatively, this could have been implemented with capture groups, but those were ~30%
 /// slower than this approach with multiple patterns.
-struct SpecialRegexp<'a> {
+struct Splits<'a> {
     pat: &'a Regex,
-    input: &'a str,
+    lookahead: &'a [bool],
+    text: &'a str,
     last: usize,
     caps: Captures,
 }

-impl<'a> Iterator for SpecialRegexp<'a> {
+impl<'a> Iterator for Splits<'a> {
     type Item = &'a str;

     fn next(&mut self) -> Option<Self::Item> {
-        let input = Input::new(&self.input[self.last..]).anchored(Anchored::Yes);
+        let input = Input::new(&self.text[self.last..]).anchored(Anchored::Yes);
         self.caps.clear();
         self.pat.captures(input, &mut self.caps);
         let m = self.caps.get_match()?;
         let start = self.last;
         let mut end = self.last + m.range().end;
-        if m.pattern() == 1.into() {
-            let last = self.input[start..end]
+        if self.lookahead[m.pattern().as_usize()] {
+            let last = self.text[start..end]
                 .chars()
                 .next_back()
                 .expect("Expected at least a look-ahead character!");
             end -= last.len_utf8();
             assert_ne!(end, start, "a look-ahead pattern must ALWAYS consume at least one character excluding the look-ahead character!");
         }
         self.last = end;
-        Some(&self.input[start..end])
+        Some(&self.text[start..end])
     }
 }
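For clarity, a worked trace of `next` with the look-ahead handling above (my annotation, not part of the commit), using the illustrative patterns ` ?\p{L}+`, `\s+\s` (look-ahead), and `\s+` on the text `"a   b"`:

```rust
// Suppose self.last == 1 after "a" was yielded. Then:
//   the anchored search over "   b" selects `\s+\s` with match range 0..3;
//   lookahead[m.pattern().as_usize()] is true, so the final ' ' (1 byte in
//   UTF-8) is subtracted: end becomes 1 + 3 - 1 = 3;
//   assert_ne! guards against a match consisting ONLY of the look-ahead char;
//   "  " (text[1..3]) is yielded, and self.last = 3 leaves " b" for the
//   next call, where ` ?\p{L}+` matches it in full.
```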