Skip to content

Commit 446cb49

Browse files
committed
Bump version, sync codebase
1 parent 3e86200 commit 446cb49

File tree

5 files changed

+10
-5
lines changed

5 files changed

+10
-5
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
This is the changelog for the open source version of tiktoken.
44

5+
## [v0.3.2]
6+
- Add encoding for GPT-4
7+
58
## [v0.3.1]
69
- Build aarch64 wheels
710
- Make `blobfile` an optional dependency

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "tiktoken"
3-
version = "0.3.1"
3+
version = "0.3.2"
44
edition = "2021"
55
rust-version = "1.57.0"
66

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@ OpenAI's models.
55

66
```python
77
import tiktoken
8-
enc = tiktoken.get_encoding("gpt2")
8+
enc = tiktoken.get_encoding("cl100k_base")
99
assert enc.decode(enc.encode("hello world")) == "hello world"
1010

1111
# To get the tokeniser corresponding to a specific model in the OpenAI API:
12-
enc = tiktoken.encoding_for_model("text-davinci-003")
12+
enc = tiktoken.encoding_for_model("gpt-4")
1313
```
1414

1515
The open source version of `tiktoken` can be installed from PyPI:

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ fn _byte_pair_merge<T>(
3434
}
3535
};
3636

37-
// We look up the ranks once in the beggining and iteratively update
37+
// We look up the ranks once in the beginning and iteratively update
3838
// them during each merge, which reduces the number of rank lookups.
3939
for i in 0..parts.len() - 2 {
4040
match get_rank(&parts, i, 0) {

tiktoken/model.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@
66
# TODO: these will likely be replaced by an API endpoint
77
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
88
# chat
9-
"gpt-3.5-turbo-": "cl100k_base" # e.g, gpt-3.5-turbo-0301, -0401, etc.
9+
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
10+
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
1011
}
1112

1213
MODEL_TO_ENCODING: dict[str, str] = {
1314
# chat
15+
"gpt-4": "cl100k_base",
1416
"gpt-3.5-turbo": "cl100k_base",
1517
# text
1618
"text-davinci-003": "p50k_base",

0 commit comments

Comments (0)