Commit 5b46173

Revert "Fix tokenizer special token handling (#67)" (#72)
This reverts commit 9ceef56.
1 parent: bdda7c8

3 files changed: 16 additions & 27 deletions

include/pytorch/tokenizers/bpe_tokenizer_base.h

Lines changed: 0 additions & 21 deletions
@@ -25,8 +25,6 @@
 #include <pytorch/tokenizers/string_integer_map.h>
 #include <pytorch/tokenizers/tokenizer.h>
 
-#include "re2/re2.h"
-
 namespace tokenizers {
 namespace detail {
 
@@ -106,25 +104,6 @@ static Result<TokenMap> buildTokenMap(
   return buildTokenMap(std::move(pairs));
 }
 
-static Result<std::unique_ptr<IRegex>> build_special_token_regex(
-    const TokenMap& special_token_map) {
-  std::string special_pattern;
-  const std::size_t count = special_token_map.size();
-
-  for (std::size_t i = 0; i < count; ++i) {
-    const auto& [token, _] = special_token_map.getElement(i);
-    if (!special_pattern.empty()) {
-      special_pattern += "|";
-    }
-    special_pattern += re2::RE2::QuoteMeta(std::string(token));
-  }
-
-  if (special_pattern.empty()) {
-    return static_cast<std::unique_ptr<IRegex>>(nullptr);
-  }
-  return create_regex(special_pattern);
-}
-
 class BPETokenizerBase : public Tokenizer {
  public:
  Result<std::vector<uint64_t>>

src/hf_tokenizer.cpp

Lines changed: 0 additions & 5 deletions
@@ -69,11 +69,6 @@ Error HFTokenizer::load(const std::string& path) {
         special_tokens,
         [](const auto& it) -> std::string { return it.at("content"); },
         [](const auto& it) -> std::uint64_t { return it.at("id"); }));
-
-    // Create special token regex to help later with encoding.
-    special_token_regex_ = TK_UNWRAP(detail::build_special_token_regex(special_token_map));
-
-    // Store for future use.
     special_token_map_.emplace(std::move(special_token_map));
   } catch (const json::out_of_range& e) {
     fprintf(stderr, "Could not parse special tokens: %s\n", e.what());

src/tiktoken.cpp

Lines changed: 16 additions & 1 deletion
@@ -32,6 +32,7 @@
 #include <fstream>
 #include <limits>
 #include <unordered_set>
+#include "re2/re2.h"
 
 namespace tokenizers {
 

@@ -46,6 +47,20 @@ static Result<std::unique_ptr<IRegex>> _create_regex(
   return create_regex(pattern);
 }
 
+static Result<std::unique_ptr<IRegex>> _build_special_token_regex(
+    const std::vector<std::pair<std::string, std::uint64_t>>& special_encoder) {
+  std::string special_pattern;
+  for (const auto& ele : special_encoder) {
+    if (!special_pattern.empty()) {
+      special_pattern += "|";
+    }
+    special_pattern += re2::RE2::QuoteMeta(ele.first);
+  }
+  if (special_pattern.empty()) {
+    return static_cast<std::unique_ptr<IRegex>>(nullptr);
+  }
+  return _create_regex(special_pattern);
+}
 
 static Result<std::pair<std::string, uint64_t>> _parse(
     const std::string& line) {
@@ -138,7 +153,7 @@ Error Tiktoken::load(const std::string& path) {
 
   _regex = TK_UNWRAP(_create_regex(_pattern));
   special_token_regex_ =
-      TK_UNWRAP(detail::build_special_token_regex(TokenMap(special_token_map)));
+      TK_UNWRAP(_build_special_token_regex(special_token_map));
 
   // initialize vocab_size, bos_tok, eos_tok
   vocab_size_ = token_map_->size() + special_token_map_->size();
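
For context, the pattern-building technique this commit restores in src/tiktoken.cpp can be exercised standalone: escape each special token with re2::RE2::QuoteMeta, join the escaped tokens with "|", and compile the alternation as one regex that matches any literal special token. Below is a minimal sketch of that technique, assuming RE2 is available to link against; the token strings and the main() harness are illustrative only, not taken from this repository.

// Minimal standalone sketch of the alternation technique used by
// _build_special_token_regex above. Build with: g++ sketch.cpp -lre2
#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>
#include "re2/re2.h"

int main() {
  // Hypothetical special tokens; real ones come from the tokenizer model file.
  std::vector<std::pair<std::string, std::uint64_t>> special_encoder = {
      {"<|begin_of_text|>", 128000},
      {"<|end_of_text|>", 128001},
  };

  // Same loop as the diff: QuoteMeta escapes metacharacters such as '|'
  // inside each token so it matches literally, then "|" joins the
  // escaped tokens into one alternation pattern.
  std::string special_pattern;
  for (const auto& ele : special_encoder) {
    if (!special_pattern.empty()) {
      special_pattern += "|";
    }
    special_pattern += re2::RE2::QuoteMeta(ele.first);
  }

  // Wrap the pattern in a capture group so FindAndConsume can report
  // each matched token.
  re2::RE2 special_regex("(" + special_pattern + ")");
  re2::StringPiece input("<|begin_of_text|>hello<|end_of_text|>");
  std::string match;
  while (RE2::FindAndConsume(&input, special_regex, &match)) {
    std::printf("matched special token: %s\n", match.c_str());
  }
  return 0;
}

Note the design choice the revert makes: rather than a shared detail::build_special_token_regex iterating a TokenMap in the BPE base header, the helper lives in src/tiktoken.cpp again and iterates the plain vector of (token, id) pairs, so the re2 dependency stays out of the public header.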

0 commit comments