
Commit 9d01e56

Sync codebase
1 parent 1b9faf2 commit 9d01e56

5 files changed: +38 -3 lines

CHANGELOG.md (+4)

@@ -2,6 +2,10 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.7.0]
+- Support for `gpt-4o`
+- Performance improvements
+
 ## [v0.6.0]
 - Optimise regular expressions for a 20% performance improvement, thanks to @paplorinc!
 - Add `text-embedding-3-*` models to `encoding_for_model`

Cargo.toml (+1 -1)

@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.6.0"
+version = "0.7.0"
 edition = "2021"
 rust-version = "1.57.0"

pyproject.toml (+1 -2)

@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.6.0"
+version = "0.7.0"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
@@ -42,4 +42,3 @@ test-command = "pytest {project}/tests --import-mode=append"
 [[tool.cibuildwheel.overrides]]
 select = "*linux_aarch64"
 test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'"""
-

tiktoken/model.py (+2)

@@ -6,6 +6,7 @@
 # TODO: these will likely be replaced by an API endpoint
 MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     # chat
+    "gpt-4o-": "o200k_base",  # e.g., gpt-4o-2024-05-13
     "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
     "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
@@ -18,6 +19,7 @@
 
 MODEL_TO_ENCODING: dict[str, str] = {
     # chat
+    "gpt-4o": "o200k_base",
     "gpt-4": "cl100k_base",
     "gpt-3.5-turbo": "cl100k_base",
     "gpt-3.5": "cl100k_base",  # Common shorthand

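The net effect of the model.py change: `tiktoken.encoding_for_model` now resolves `gpt-4o` through `MODEL_TO_ENCODING`, and dated snapshots such as `gpt-4o-2024-05-13` through the new `gpt-4o-` entry in `MODEL_PREFIX_TO_ENCODING`, both to `o200k_base`. A minimal sketch of what a caller sees, assuming tiktoken >= 0.7.0 and network access to fetch the encoding file on first use:

import tiktoken

# Exact model names hit MODEL_TO_ENCODING
enc = tiktoken.encoding_for_model("gpt-4o")
print(enc.name)  # o200k_base

# Dated snapshots fall through to the "gpt-4o-" prefix in MODEL_PREFIX_TO_ENCODING
print(tiktoken.encoding_for_model("gpt-4o-2024-05-13").name)  # o200k_base

# Older chat models are unchanged
print(tiktoken.encoding_for_model("gpt-4").name)  # cl100k_base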
tiktoken_ext/openai_public.py (+30)

@@ -88,10 +88,40 @@ def cl100k_base():
     }
 
 
+def o200k_base():
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
+        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
+    )
+    special_tokens = {
+        ENDOFTEXT: 199999,
+        ENDOFPROMPT: 200018,
+    }
+    # This regex could be made more efficient
+    pat_str = "|".join(
+        [
+            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+            r"""\p{N}{1,3}""",
+            r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
+            r"""\s*[\r\n]+""",
+            r"""\s+(?!\S)""",
+            r"""\s+""",
+        ]
+    )
+    return {
+        "name": "o200k_base",
+        "pat_str": pat_str,
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": special_tokens,
+    }
+
+
 ENCODING_CONSTRUCTORS = {
     "gpt2": gpt2,
     "r50k_base": r50k_base,
     "p50k_base": p50k_base,
     "p50k_edit": p50k_edit,
     "cl100k_base": cl100k_base,
+    "o200k_base": o200k_base,
 }
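
For reference, `tiktoken.get_encoding("o200k_base")` reaches the constructor above through the `tiktoken_ext` plugin namespace, and the returned dict (name, pat_str, mergeable_ranks, special_tokens) is used to build a `tiktoken.Encoding`. A small usage sketch, assuming tiktoken >= 0.7.0 and network access to fetch o200k_base.tiktoken; the special-token ids come from the diff above (ENDOFTEXT and ENDOFPROMPT are the "<|endoftext|>" and "<|endofprompt|>" strings defined earlier in this file), and the sample text is arbitrary:

import tiktoken

enc = tiktoken.get_encoding("o200k_base")

# Ordinary text round-trips through encode/decode
tokens = enc.encode("hello world")
assert enc.decode(tokens) == "hello world"

# Special tokens must be explicitly allowed; their ids match the diff above
assert enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}) == [199999]
assert enc.encode("<|endofprompt|>", allowed_special={"<|endofprompt|>"}) == [200018]

# o200k_base has a different vocabulary than cl100k_base, so token ids differ
cl100k = tiktoken.get_encoding("cl100k_base")
print(tokens, cl100k.encode("hello world"))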
