@@ -80,12 +80,17 @@ impl Tokenizer {
80
80
Ok ( Self { bpe, pre } )
81
81
}
82
82
83
+ /// Count the number of tokens produced when encoding the text. Applies pre-tokenization
84
+ /// before counting.
83
85
pub fn count ( & self , text : & str ) -> usize {
84
86
self . split ( text)
85
87
. map ( |piece| self . bpe . count ( piece. as_bytes ( ) ) )
86
88
. sum ( )
87
89
}
88
90
91
+ /// Returns the token count iff the total token count stays below the specified token_limit.
92
+ /// Otherwise, it returns `None`. This function can be faster than [`Self::count`] when the
93
+ /// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
89
94
pub fn count_till_limit ( & self , text : & str , token_limit : usize ) -> Option < usize > {
90
95
self . split ( text)
91
96
. try_fold ( token_limit, |token_limit, piece| {
@@ -95,16 +100,21 @@ impl Tokenizer {
95
100
} )
96
101
}
97
102
103
+ /// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
104
+ /// encoding.
98
105
pub fn encode ( & self , text : & str ) -> Vec < u32 > {
99
106
self . split ( text)
100
107
. flat_map ( |piece| self . bpe . encode_via_backtracking ( piece. as_bytes ( ) ) )
101
108
. collect ( )
102
109
}
103
-
110
+ /// Returns the text corresponding to the given encoding if it is valid UTF-8. Otherwise,
111
+ /// returns none.
104
112
pub fn decode ( & self , tokens : & [ u32 ] ) -> Option < String > {
105
113
String :: from_utf8 ( self . bpe . decode_tokens ( tokens) ) . ok ( )
106
114
}
107
115
116
+ /// Returns an iterator with the text pieces resulting from pre-tokenization. If this
117
+ /// tokenizer does not have pre-tokenization, the iterator returns the full text.
108
118
pub fn split < ' a > ( & ' a self , text : & ' a str ) -> impl Iterator < Item = & str > + ' a {
109
119
match & self . pre {
110
120
Some ( pre) => Either :: Left ( pre. split ( text) ) ,
@@ -133,6 +143,7 @@ impl Pretokenizer {
133
143
Ok ( Self { pat, lookahead } )
134
144
}
135
145
146
+ /// Returns an iterator with the text pieces after splitting with the regular expression.
136
147
pub fn split < ' a > ( & ' a self , text : & ' a str ) -> impl Iterator < Item = & str > + ' a {
137
148
Splits {
138
149
pat : & self . pat ,
0 commit comments