Commit 5b46173

Revert "Fix tokenizer special token handling (#67)" (#72)
This reverts commit 9ceef56.
1 parent: bdda7c8

3 files changed: 16 additions & 27 deletions

include/pytorch/tokenizers/bpe_tokenizer_base.h

Lines changed: 0 additions & 21 deletions
@@ -25,8 +25,6 @@
 #include <pytorch/tokenizers/string_integer_map.h>
 #include <pytorch/tokenizers/tokenizer.h>
 
-#include "re2/re2.h"
-
 namespace tokenizers {
 namespace detail {
 
@@ -106,25 +104,6 @@ static Result<TokenMap> buildTokenMap(
   return buildTokenMap(std::move(pairs));
 }
 
-static Result<std::unique_ptr<IRegex>> build_special_token_regex(
-    const TokenMap& special_token_map) {
-  std::string special_pattern;
-  const std::size_t count = special_token_map.size();
-
-  for (std::size_t i = 0; i < count; ++i) {
-    const auto& [token, _] = special_token_map.getElement(i);
-    if (!special_pattern.empty()) {
-      special_pattern += "|";
-    }
-    special_pattern += re2::RE2::QuoteMeta(std::string(token));
-  }
-
-  if (special_pattern.empty()) {
-    return static_cast<std::unique_ptr<IRegex>>(nullptr);
-  }
-  return create_regex(special_pattern);
-}
-
 class BPETokenizerBase : public Tokenizer {
  public:
  Result<std::vector<uint64_t>>

src/hf_tokenizer.cpp

Lines changed: 0 additions & 5 deletions
@@ -69,11 +69,6 @@ Error HFTokenizer::load(const std::string& path) {
         special_tokens,
         [](const auto& it) -> std::string { return it.at("content"); },
         [](const auto& it) -> std::uint64_t { return it.at("id"); }));
-
-    // Create special token regex to help later with encoding.
-    special_token_regex_ = TK_UNWRAP(detail::build_special_token_regex(special_token_map));
-
-    // Store for future use.
     special_token_map_.emplace(std::move(special_token_map));
   } catch (const json::out_of_range& e) {
     fprintf(stderr, "Could not parse special tokens: %s\n", e.what());

src/tiktoken.cpp

Lines changed: 16 additions & 1 deletion
@@ -32,6 +32,7 @@
 #include <fstream>
 #include <limits>
 #include <unordered_set>
+#include "re2/re2.h"
 
 namespace tokenizers {
 

@@ -46,6 +47,20 @@ static Result<std::unique_ptr<IRegex>> _create_regex(
   return create_regex(pattern);
 }
 
+static Result<std::unique_ptr<IRegex>> _build_special_token_regex(
+    const std::vector<std::pair<std::string, std::uint64_t>>& special_encoder) {
+  std::string special_pattern;
+  for (const auto& ele : special_encoder) {
+    if (!special_pattern.empty()) {
+      special_pattern += "|";
+    }
+    special_pattern += re2::RE2::QuoteMeta(ele.first);
+  }
+  if (special_pattern.empty()) {
+    return static_cast<std::unique_ptr<IRegex>>(nullptr);
+  }
+  return _create_regex(special_pattern);
+}
 
 static Result<std::pair<std::string, uint64_t>> _parse(
     const std::string& line) {
@@ -138,7 +153,7 @@ Error Tiktoken::load(const std::string& path) {
 
   _regex = TK_UNWRAP(_create_regex(_pattern));
   special_token_regex_ =
-      TK_UNWRAP(detail::build_special_token_regex(TokenMap(special_token_map)));
+      TK_UNWRAP(_build_special_token_regex(special_token_map));
 
   // initialize vocab_size, bos_tok, eos_tok
   vocab_size_ = token_map_->size() + special_token_map_->size();
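
For context, the pattern-building technique this commit restores in src/tiktoken.cpp can be exercised standalone: escape each special token with re2::RE2::QuoteMeta, join the escaped tokens with "|", and compile the alternation as one regex that matches any literal special token. Below is a minimal sketch of that technique, assuming RE2 is available to link against; the token strings and the main() harness are illustrative only, not taken from this repository.

// Minimal standalone sketch of the alternation technique used by
// _build_special_token_regex above. Build with: g++ sketch.cpp -lre2
#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>
#include "re2/re2.h"

int main() {
  // Hypothetical special tokens; real ones come from the tokenizer model file.
  std::vector<std::pair<std::string, std::uint64_t>> special_encoder = {
      {"<|begin_of_text|>", 128000},
      {"<|end_of_text|>", 128001},
  };

  // Same loop as the diff: QuoteMeta escapes metacharacters such as '|'
  // inside each token so it matches literally, then "|" joins the
  // escaped tokens into one alternation pattern.
  std::string special_pattern;
  for (const auto& ele : special_encoder) {
    if (!special_pattern.empty()) {
      special_pattern += "|";
    }
    special_pattern += re2::RE2::QuoteMeta(ele.first);
  }

  // Wrap the pattern in a capture group so FindAndConsume can report
  // each matched token.
  re2::RE2 special_regex("(" + special_pattern + ")");
  re2::StringPiece input("<|begin_of_text|>hello<|end_of_text|>");
  std::string match;
  while (RE2::FindAndConsume(&input, special_regex, &match)) {
    std::printf("matched special token: %s\n", match.c_str());
  }
  return 0;
}

Note the design choice the revert makes: rather than a shared detail::build_special_token_regex iterating a TokenMap in the BPE base header, the helper lives in src/tiktoken.cpp again and iterates the plain vector of (token, id) pairs, so the re2 dependency stays out of the public header.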

0 commit comments