File tree
include/pytorch/tokenizers
3 files changed, +16 -27 lines
First file — the tokenizers::detail header (declares BPETokenizerBase):

@@ -25,8 +25,6 @@
 #include <pytorch/tokenizers/string_integer_map.h>
 #include <pytorch/tokenizers/tokenizer.h>
 
-#include "re2/re2.h"
-
 namespace tokenizers {
 namespace detail {
 
@@ -106,25 +104,6 @@ static Result<TokenMap> buildTokenMap(
   return buildTokenMap(std::move(pairs));
 }
 
-static Result<std::unique_ptr<IRegex>> build_special_token_regex(
-    const TokenMap& special_token_map) {
-  std::string special_pattern;
-  const std::size_t count = special_token_map.size();
-
-  for (std::size_t i = 0; i < count; ++i) {
-    const auto& [token, _] = special_token_map.getElement(i);
-    if (!special_pattern.empty()) {
-      special_pattern += "|";
-    }
-    special_pattern += re2::RE2::QuoteMeta(std::string(token));
-  }
-
-  if (special_pattern.empty()) {
-    return static_cast<std::unique_ptr<IRegex>>(nullptr);
-  }
-  return create_regex(special_pattern);
-}
-
 class BPETokenizerBase : public Tokenizer {
  public:
   Result<std::vector<uint64_t>>
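For context on what the removed helper did: it joined every special token into a single alternation pattern, escaping each token with re2::RE2::QuoteMeta so metacharacters such as <, |, and > match literally, and it returned a null regex when there were no special tokens at all. Below is a minimal standalone sketch of that pattern construction; it is not part of the patch, and the token strings are illustrative placeholders rather than anything from a real tokenizer config.

// Standalone sketch of the escaped-alternation pattern the helper builds.
// Not part of the patch; the token strings are made up for illustration.
#include <cstdio>
#include <string>
#include <vector>

#include "re2/re2.h"

int main() {
  const std::vector<std::string> special_tokens = {
      "<|endoftext|>", "<|fim_prefix|>", "<|fim_suffix|>"};

  std::string special_pattern;
  for (const auto& token : special_tokens) {
    if (!special_pattern.empty()) {
      special_pattern += "|";
    }
    // QuoteMeta backslash-escapes everything outside [A-Za-z0-9_].
    special_pattern += re2::RE2::QuoteMeta(token);
  }

  // Prints roughly: \<\|endoftext\|\>|\<\|fim_prefix\|\>|\<\|fim_suffix\|\>
  std::printf("%s\n", special_pattern.c_str());
  return re2::RE2(special_pattern).ok() ? 0 : 1;
}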
Second file — HFTokenizer::load:

@@ -69,11 +69,6 @@ Error HFTokenizer::load(const std::string& path) {
         special_tokens,
         [](const auto& it) -> std::string { return it.at("content"); },
         [](const auto& it) -> std::uint64_t { return it.at("id"); }));
-
-    // Create special token regex to help later with encoding.
-    special_token_regex_ = TK_UNWRAP(detail::build_special_token_regex(special_token_map));
-
-    // Store for future use.
     special_token_map_.emplace(std::move(special_token_map));
   } catch (const json::out_of_range& e) {
     fprintf(stderr, "Could not parse special tokens: %s\n", e.what());
Third file — Tiktoken::load:

@@ -32,6 +32,7 @@
 #include <fstream>
 #include <limits>
 #include <unordered_set>
+#include "re2/re2.h"
 
 namespace tokenizers {
 
@@ -46,6 +47,20 @@ static Result<std::unique_ptr<IRegex>> _create_regex(
   return create_regex(pattern);
 }
 
+static Result<std::unique_ptr<IRegex>> _build_special_token_regex(
+    const std::vector<std::pair<std::string, std::uint64_t>>& special_encoder) {
+  std::string special_pattern;
+  for (const auto& ele : special_encoder) {
+    if (!special_pattern.empty()) {
+      special_pattern += "|";
+    }
+    special_pattern += re2::RE2::QuoteMeta(ele.first);
+  }
+  if (special_pattern.empty()) {
+    return static_cast<std::unique_ptr<IRegex>>(nullptr);
+  }
+  return _create_regex(special_pattern);
+}
 
 static Result<std::pair<std::string, uint64_t>> _parse(
     const std::string& line) {
@@ -138,7 +153,7 @@ Error Tiktoken::load(const std::string& path) {
 
   _regex = TK_UNWRAP(_create_regex(_pattern));
   special_token_regex_ =
-      TK_UNWRAP(detail::build_special_token_regex(TokenMap(special_token_map)));
+      TK_UNWRAP(_build_special_token_regex(special_token_map));
 
   // initialize vocab_size, bos_tok, eos_tok
   vocab_size_ = token_map_->size() + special_token_map_->size();
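The helper re-added here is the file-local replacement: it takes the raw (token, id) pair vector that Tiktoken::load already has, so the call site no longer needs to wrap it in a TokenMap, and it keeps the same contract of returning a null regex when there are no special tokens rather than compiling an empty pattern. A rough standalone analogue of that contract follows, with std::unique_ptr<re2::RE2> standing in for the library's Result<std::unique_ptr<IRegex>> machinery, which is not reproduced here.

// Standalone analogue of the new helper's contract; re2::RE2 stands in for the
// library's IRegex/Result types. Not part of the patch.
#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "re2/re2.h"

static std::unique_ptr<re2::RE2> build_special_token_regex(
    const std::vector<std::pair<std::string, std::uint64_t>>& special_encoder) {
  std::string special_pattern;
  for (const auto& ele : special_encoder) {
    if (!special_pattern.empty()) {
      special_pattern += "|";
    }
    special_pattern += re2::RE2::QuoteMeta(ele.first);
  }
  if (special_pattern.empty()) {
    return nullptr;  // no special tokens: return a null regex, not an empty pattern
  }
  return std::make_unique<re2::RE2>(special_pattern);
}

int main() {
  // Empty encoder: no regex is built at all.
  assert(build_special_token_regex({}) == nullptr);

  // Hypothetical special tokens, just to exercise the non-empty path.
  const auto re = build_special_token_regex({{"<|endoftext|>", 0}, {"<|pad|>", 1}});
  assert(re && re->ok());
  assert(re2::RE2::PartialMatch("hello<|endoftext|>", *re));
  return 0;
}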