@@ -8,7 +8,7 @@

 // Local
 #include <pytorch/tokenizers/pre_tokenizer.h>
-#include <pytorch/tokenizers/third-party/llama.cpp-unicode/unicode.h>
+#include <unicode.h>

 // Standard
 #include <algorithm>
@@ -63,37 +63,35 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
           "Missing pretokenizers for PreTokenizer of type Sequence");
     }
     std::vector<PreTokenizer::Ptr> pretoks;
-    std::transform(
-        pretokenizers->begin(),
-        pretokenizers->end(),
-        std::back_inserter(pretoks),
-        [](const PreTokenizerConfig& cfg) { return cfg.create(); });
+    std::transform(pretokenizers->begin(), pretokenizers->end(),
+                   std::back_inserter(pretoks),
+                   [](const PreTokenizerConfig &cfg) { return cfg.create(); });
     return PreTokenizer::Ptr(new SequencePreTokenizer(pretoks));
   }
   throw std::runtime_error("Unsupported PreTokenizer type: " + type);
 }

-PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
+PreTokenizerConfig &PreTokenizerConfig::parse_json(const json &json_config) {
   type = json_config.at("type");
   if (type == "Split") {
     try {
       pattern = json_config.at("pattern");
-    } catch (json::out_of_range&) {
+    } catch (json::out_of_range &) {
     }
   } else if (type == "Digits") {
     try {
       individual_digits = json_config.at("individual_digits");
-    } catch (json::out_of_range&) {
+    } catch (json::out_of_range &) {
     }
   } else if (type == "ByteLevel") {
     try {
       add_prefix_space = json_config.at("add_prefix_space");
-    } catch (json::out_of_range&) {
+    } catch (json::out_of_range &) {
     }
     // TODO: trim_offsets, use_regex
   } else if (type == "Sequence") {
     pretokenizers = std::vector<PreTokenizerConfig>();
-    for (const auto& entry : json_config.at("pretokenizers")) {
+    for (const auto &entry : json_config.at("pretokenizers")) {
       pretokenizers->push_back(PreTokenizerConfig().parse_json(entry));
     }
   } else {
@@ -104,14 +102,14 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {

 // RegexPreTokenizer /////////////////////////////////////////////////////////

-RegexPreTokenizer::Re2UPtr RegexPreTokenizer::create_regex_(
-    const std::string& pattern) {
+RegexPreTokenizer::Re2UPtr
+RegexPreTokenizer::create_regex_(const std::string &pattern) {
   assert(!pattern.empty());
   return std::make_unique<re2::RE2>("(" + pattern + ")");
 }

-std::vector<std::string> RegexPreTokenizer::pre_tokenize(
-    re2::StringPiece input) const {
+std::vector<std::string>
+RegexPreTokenizer::pre_tokenize(re2::StringPiece input) const {
   std::vector<std::string> result;
   std::string piece;
   while (RE2::FindAndConsume(&input, *regex_, &piece)) {
@@ -138,14 +136,13 @@ constexpr char GPT2_EXPR[] =
 // Construction //
 //////////////////

-ByteLevelPreTokenizer::ByteLevelPreTokenizer(
-    bool add_prefix_space,
-    const std::string& pattern)
+ByteLevelPreTokenizer::ByteLevelPreTokenizer(bool add_prefix_space,
+                                             const std::string &pattern)
     : pattern_(pattern.empty() ? GPT2_EXPR : pattern),
       add_prefix_space_(add_prefix_space) {}

-std::vector<std::string> ByteLevelPreTokenizer::pre_tokenize(
-    re2::StringPiece input) const {
+std::vector<std::string>
+ByteLevelPreTokenizer::pre_tokenize(re2::StringPiece input) const {
   // Add the prefix space if configured to do so
   std::string input_str(input);
   if (add_prefix_space_ && !input_str.empty() && input_str[0] != ' ') {
@@ -161,13 +158,13 @@ SequencePreTokenizer::SequencePreTokenizer(
     std::vector<PreTokenizer::Ptr> pre_tokenizers)
     : pre_tokenizers_(std::move(pre_tokenizers)) {}

-std::vector<std::string> SequencePreTokenizer::pre_tokenize(
-    re2::StringPiece input) const {
+std::vector<std::string>
+SequencePreTokenizer::pre_tokenize(re2::StringPiece input) const {
   std::vector<std::string> pieces{std::string(input)};
-  for (const auto& pre_tokenizer : pre_tokenizers_) {
+  for (const auto &pre_tokenizer : pre_tokenizers_) {
     std::vector<std::string> new_pieces;
-    for (const auto& piece : pieces) {
-      for (const auto& subpiece : pre_tokenizer->pre_tokenize(piece)) {
+    for (const auto &piece : pieces) {
+      for (const auto &subpiece : pre_tokenizer->pre_tokenize(piece)) {
         new_pieces.push_back(subpiece);
       }
     }
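
Note on the RegexPreTokenizer hunk: the splitting loop relies on RE2::FindAndConsume advancing the StringPiece past each match of the parenthesized pattern that create_regex_ builds. A minimal standalone sketch of that idiom, using a hypothetical (\w+) pattern rather than anything configured by this library:

#include <iostream>
#include <string>

#include <re2/re2.h>

int main() {
  // create_regex_ wraps the configured pattern in a capture group; "\\w+"
  // here is only an illustrative pattern, not one the library ships.
  re2::RE2 regex("(\\w+)");
  re2::StringPiece input("hello world 42");
  std::string piece;
  // FindAndConsume finds the next match and advances `input` past it, so
  // each iteration yields one captured token, mirroring pre_tokenize above.
  while (re2::RE2::FindAndConsume(&input, regex, &piece)) {
    std::cout << piece << "\n";  // prints: hello, world, 42
  }
  return 0;
}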
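For end-to-end context, the parse_json/create pair reformatted above turns the pre_tokenizer section of a HuggingFace-style tokenizer.json into a runnable pre-tokenizer. The following is a usage sketch, not taken from the repo's docs: the `tokenizers` namespace and the create() behavior for the Split/Digits branches (not shown in this diff) are assumptions based on the code above.

#include <iostream>

#include <nlohmann/json.hpp>
#include <pytorch/tokenizers/pre_tokenizer.h>

int main() {
  // Hypothetical config exercising the "Sequence" branch of parse_json:
  // a Digits pre-tokenizer followed by a regex Split.
  auto config = nlohmann::json::parse(R"({
    "type": "Sequence",
    "pretokenizers": [
      {"type": "Digits", "individual_digits": true},
      {"type": "Split", "pattern": "[a-zA-Z]+"}
    ]
  })");

  // Namespace assumed to be `tokenizers`. parse_json returns *this (it is
  // chained the same way in the Sequence branch above), so create() can be
  // called directly; it throws on unsupported types.
  tokenizers::PreTokenizer::Ptr pre_tok =
      tokenizers::PreTokenizerConfig().parse_json(config).create();

  for (const auto &piece : pre_tok->pre_tokenize("abc123")) {
    std::cout << piece << "\n";
  }
  return 0;
}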