@@ -2,42 +2,48 @@ use std::sync::LazyLock;

 use bpe::byte_pair_encoding::BytePairEncoding;
 use either::Either;
-use fancy_regex::Regex;
+use regex_automata::{meta::Regex, util::captures::Captures, Anchored, Input};

 static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+";
     Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });

 static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
+    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+";
+    let pat2 = "\\s+\\s";
+    let pat3 = "\\s+";
+    Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
 });

 static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+";
+    // Note: Rewrite the negative look-ahead with a positive pseudo look-ahead.
+    // The look-ahead character is dropped from the match by the SpecialRegexp iterator.
+    let pat2 = "\\s+\\s";
+    let pat3 = "\\s+";
+    Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
 });
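
The pseudo look-ahead rewrite noted above can be checked in isolation. A minimal sketch, not part of this change and using toy patterns in place of the real ones: pattern order defines priority in regex_automata, so `\s+\s` (index 1) outranks `\s+` (index 2) whenever one more whitespace character follows the run, and dropping the final character of such a match reproduces what `\s+(?!\S)` used to match.

    use regex_automata::{meta::Regex, Anchored, Input};

    fn main() {
        // Toy pattern set: slot 1 is the pseudo look-ahead, slot 2 the fallback.
        let pat = Regex::new_many(&[" ?\\p{L}+", "\\s+\\s", "\\s+"]).unwrap();
        let text = "   abc";
        let m = pat.find(Input::new(text).anchored(Anchored::Yes)).unwrap();
        assert_eq!(m.pattern().as_usize(), 1); // the pseudo look-ahead pattern won
        let end = m.end() - 1; // drop the single look-ahead character again
        assert_eq!(&text[..end], "  "); // the final space is left over for " abc"
    }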

 static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = [
+    let pat1 = [
         "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
         "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
         "\\p{N}{1,3}",
         " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
         "\\s*[\\r\\n]+",
-        "\\s+(?!\\S)",
-        "\\s+",
     ].join("|");
-    Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
+    let pat2 = "\\s+\\s";
+    let pat3 = "\\s+";
+    Tokenizer::with_many(bpe, &[pat1.as_str(), pat2, pat3]).expect("valid regex")
 });

 pub use bpe::*;
@@ -57,8 +63,15 @@ pub struct Tokenizer {

 impl Tokenizer {
     #[allow(clippy::result_large_err)]
-    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
-        let pat = pat.map(fancy_regex::Regex::new).transpose()?;
+    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, ()> {
+        let pat = pat.map(Regex::new).transpose().map_err(|_| ())?;
+        Ok(Self { bpe, pat })
+    }
+
+    /// When using multiple patterns, the second pattern is assumed to be a look-ahead pattern with
+    /// exactly one look-ahead character!
+    pub fn with_many(bpe: BytePairEncoding, patterns: &[&str]) -> Result<Self, ()> {
+        let pat = Some(Regex::new_many(patterns).map_err(|_| ())?);
         Ok(Self { bpe, pat })
     }

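For illustration, a hedged sketch of how `with_many` is intended to be called, mirroring the statics above; the helper name `build_p50k` is hypothetical, pattern order is significant, and slot 1 must hold the pseudo look-ahead pattern:

    // Sketch only: `bpe` stands for any deserialized BytePairEncoding as above.
    fn build_p50k(bpe: BytePairEncoding) -> Result<Tokenizer, ()> {
        let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+";
        let pat2 = "\\s+\\s"; // slot 1: pseudo look-ahead, exactly one look-ahead char
        let pat3 = "\\s+"; // slot 2: plain fallback for trailing whitespace
        Tokenizer::with_many(bpe, &[pat1, pat2, pat3])
    }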
@@ -78,16 +91,51 @@ impl Tokenizer {
         String::from_utf8(self.bpe.decode_tokens(tokens)).ok()
     }

-    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
+    pub fn split<'a>(&'a self, input: &'a str) -> impl Iterator<Item = &str> + 'a {
         match &self.pat {
-            Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
-                let m = m.expect("match succeeded");
-                assert_eq!(*start, m.start(), "pattern should match all input text");
-                *start = m.end();
-                Some(m.as_str())
-            })),
-            None => Either::Right(std::iter::once(text)),
+            Some(pat) => Either::Left(SpecialRegexp {
+                pat,
+                input,
+                last: 0,
+                caps: Captures::matches(pat.group_info().clone()),
+            }),
+            None => Either::Right(std::iter::once(input)),
+        }
+    }
+}
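
A side note on the `Captures` buffer: it is allocated once per iterator and reused on every call. A small self-contained sketch of this regex_automata API, with toy patterns and illustrative values, also showing that match offsets are relative to the sliced input, which is why `next()` below adds `self.last` back:

    use regex_automata::{meta::Regex, util::captures::Captures, Anchored, Input};

    fn main() {
        let pat = Regex::new_many(&["\\p{L}+", "\\s+\\s", "\\s+"]).unwrap();
        // Allocate the captures buffer once, like the `caps` field below.
        let mut caps = Captures::matches(pat.group_info().clone());
        let text = "ab  cd";
        caps.clear();
        pat.captures(Input::new(&text[2..]).anchored(Anchored::Yes), &mut caps);
        let m = caps.get_match().unwrap();
        // The match starts at 0 relative to the slice, i.e. at 2 in `text`.
        assert_eq!((m.pattern().as_usize(), m.range()), (1, 0..2));
    }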
+
+/// This is a small wrapper around the regex which emulates the behaviour of look-ahead by
+/// dropping the look-ahead character from the match. The assumption here is that the
+/// second pattern is always a look-ahead pattern, and that just a single character needs
+/// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
+/// but achieve a >3x speedup.
+///
+/// Alternatively, this could have been implemented with capture groups, but those were ~30%
+/// slower than this approach with multiple patterns.
+struct SpecialRegexp<'a> {
+    pat: &'a Regex,
+    input: &'a str,
+    last: usize,
+    caps: Captures,
+}
+
+impl<'a> Iterator for SpecialRegexp<'a> {
+    type Item = &'a str;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let input = Input::new(&self.input[self.last..]).anchored(Anchored::Yes);
+        self.caps.clear();
+        self.pat.captures(input, &mut self.caps);
+        let m = self.caps.get_match()?;
+        let start = self.last;
+        let mut end = self.last + m.range().end;
+        if m.pattern() == 1.into() {
+            let last = self.input[start..end].chars().rev().next().unwrap();
+            end -= last.len_utf8();
+            assert_ne!(end, start, "a look-ahead pattern must ALWAYS consume at least one character excluding the look-ahead character!");
         }
+        self.last = end;
+        Some(&self.input[start..end])
     }
 }

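Taken together, the multi-pattern regex plus the character drop should preserve the old look-ahead semantics. A hedged in-crate sketch of the expected behaviour; the test name is illustrative, the statics above are crate-internal, and the exact boundaries follow from the cl100k pattern:

    #[test]
    fn pseudo_lookahead_keeps_old_splits() {
        // In "Hello  world", the two-space run is split so that the second space
        // is carried into the next word piece, just as `\s+(?!\S)` used to do.
        let pieces: Vec<&str> = BPE_CL100K_BASE.split("Hello  world").collect();
        assert_eq!(pieces, vec!["Hello", " ", " world"]);
    }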