
Commit 9d01e56

Sync codebase
1 parent 1b9faf2 commit 9d01e56

5 files changed: +38 -3 lines

CHANGELOG.md (+4)

@@ -2,6 +2,10 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.7.0]
+- Support for `gpt-4o`
+- Performance improvements
+
 ## [v0.6.0]
 - Optimise regular expressions for a 20% performance improvement, thanks to @paplorinc!
 - Add `text-embedding-3-*` models to `encoding_for_model`

Cargo.toml (+1 -1)

@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.6.0"
+version = "0.7.0"
 edition = "2021"
 rust-version = "1.57.0"

pyproject.toml (+1 -2)

@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.6.0"
+version = "0.7.0"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
@@ -42,4 +42,3 @@ test-command = "pytest {project}/tests --import-mode=append"
 [[tool.cibuildwheel.overrides]]
 select = "*linux_aarch64"
 test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'"""
-

tiktoken/model.py (+2)

@@ -6,6 +6,7 @@
 # TODO: these will likely be replaced by an API endpoint
 MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     # chat
+    "gpt-4o-": "o200k_base",  # e.g., gpt-4o-2024-05-13
     "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
     "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
@@ -18,6 +19,7 @@
 
 MODEL_TO_ENCODING: dict[str, str] = {
     # chat
+    "gpt-4o": "o200k_base",
     "gpt-4": "cl100k_base",
     "gpt-3.5-turbo": "cl100k_base",
     "gpt-3.5": "cl100k_base",  # Common shorthand

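The net effect of the model.py change: `tiktoken.encoding_for_model` now resolves `gpt-4o` through `MODEL_TO_ENCODING`, and dated snapshots such as `gpt-4o-2024-05-13` through the new `gpt-4o-` entry in `MODEL_PREFIX_TO_ENCODING`, both to `o200k_base`. A minimal sketch of what a caller sees, assuming tiktoken >= 0.7.0 and network access to fetch the encoding file on first use:

import tiktoken

# Exact model names hit MODEL_TO_ENCODING
enc = tiktoken.encoding_for_model("gpt-4o")
print(enc.name)  # o200k_base

# Dated snapshots fall through to the "gpt-4o-" prefix in MODEL_PREFIX_TO_ENCODING
print(tiktoken.encoding_for_model("gpt-4o-2024-05-13").name)  # o200k_base

# Older chat models are unchanged
print(tiktoken.encoding_for_model("gpt-4").name)  # cl100k_base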
tiktoken_ext/openai_public.py (+30)

@@ -88,10 +88,40 @@ def cl100k_base():
     }
 
 
+def o200k_base():
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
+        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
+    )
+    special_tokens = {
+        ENDOFTEXT: 199999,
+        ENDOFPROMPT: 200018,
+    }
+    # This regex could be made more efficient
+    pat_str = "|".join(
+        [
+            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+            r"""\p{N}{1,3}""",
+            r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
+            r"""\s*[\r\n]+""",
+            r"""\s+(?!\S)""",
+            r"""\s+""",
+        ]
+    )
+    return {
+        "name": "o200k_base",
+        "pat_str": pat_str,
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": special_tokens,
+    }
+
+
 ENCODING_CONSTRUCTORS = {
     "gpt2": gpt2,
     "r50k_base": r50k_base,
     "p50k_base": p50k_base,
     "p50k_edit": p50k_edit,
     "cl100k_base": cl100k_base,
+    "o200k_base": o200k_base,
 }
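
For reference, `tiktoken.get_encoding("o200k_base")` reaches the constructor above through the `tiktoken_ext` plugin namespace, and the returned dict (name, pat_str, mergeable_ranks, special_tokens) is used to build a `tiktoken.Encoding`. A small usage sketch, assuming tiktoken >= 0.7.0 and network access to fetch o200k_base.tiktoken; the special-token ids come from the diff above (ENDOFTEXT and ENDOFPROMPT are the "<|endoftext|>" and "<|endofprompt|>" strings defined earlier in this file), and the sample text is arbitrary:

import tiktoken

enc = tiktoken.get_encoding("o200k_base")

# Ordinary text round-trips through encode/decode
tokens = enc.encode("hello world")
assert enc.decode(tokens) == "hello world"

# Special tokens must be explicitly allowed; their ids match the diff above
assert enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}) == [199999]
assert enc.encode("<|endofprompt|>", allowed_special={"<|endofprompt|>"}) == [200018]

# o200k_base has a different vocabulary than cl100k_base, so token ids differ
cl100k = tiktoken.get_encoding("cl100k_base")
print(tokens, cl100k.encode("hello world"))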
