Drop legacy token sets
hendrikvanantwerpen committed Oct 18, 2024
1 parent ad953d9 commit f183341
Showing 5 changed files with 1 addition and 40 deletions.

crates/bpe-openai/README.md (2 changes: 0 additions & 2 deletions)

@@ -7,8 +7,6 @@ For convencience it re-exports the `bpe` crate so that depending on this crate i
 
 Supported tokenizers:
 
-- r50k
-- p50k
 - cl100k
 - o200k
 
crates/bpe-openai/build.rs (2 changes: 0 additions & 2 deletions)

@@ -7,8 +7,6 @@ use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
 use serde::Serialize;
 
 fn main() {
-    serialize_tiktoken_bpe("r50k_base", include_bytes!("data/r50k_base.tiktoken.gz"), 1);
-    serialize_tiktoken_bpe("p50k_base", include_bytes!("data/p50k_base.tiktoken.gz"), 1);
     serialize_tiktoken_bpe(
         "cl100k_base",
         include_bytes!("data/cl100k_base.tiktoken.gz"),

Binary file removed crates/bpe-openai/data/p50k_base.tiktoken.gz
Binary file removed crates/bpe-openai/data/r50k_base.tiktoken.gz

crates/bpe-openai/src/lib.rs (37 changes: 1 addition & 36 deletions)

@@ -8,20 +8,6 @@ use fancy_regex::Regex;
 // The look-ahead character is dropped from the match by the Pretokenizer iterator.
 // Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
 
-static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
-    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
-});
-
-static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
-    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
-});
-
 static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
@@ -96,14 +82,6 @@ impl Tokenizer {
     }
 }
 
-pub fn r50k_base() -> &'static Tokenizer {
-    &BPE_R50K_BASE
-}
-
-pub fn p50k_base() -> &'static Tokenizer {
-    &BPE_P50K_BASE
-}
-
 pub fn cl100k_base() -> &'static Tokenizer {
     &BPE_CL100K_BASE
 }
@@ -115,23 +93,10 @@ pub fn o200k_base() -> &'static Tokenizer {
 #[cfg(test)]
 mod tests {
     use bpe::byte_pair_encoding::{create_test_string, select_test_string};
-    use tiktoken_rs::{
-        cl100k_base_singleton, o200k_base_singleton, p50k_base_singleton, r50k_base_singleton,
-        CoreBPE,
-    };
+    use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton, CoreBPE};
 
     use super::*;
 
-    #[test]
-    fn test_r50k() {
-        test_equivalence(r50k_base(), &r50k_base_singleton().lock());
-    }
-
-    #[test]
-    fn test_p50k() {
-        test_equivalence(p50k_base(), &p50k_base_singleton().lock());
-    }
-
     #[test]
     fn test_cl100k() {
         test_equivalence(cl100k_base(), &cl100k_base_singleton().lock());
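
The removed statics also carried the legacy r50k/p50k pretokenizer pattern. For readers curious about the look-ahead note in the diff context above, here is a standalone sketch that runs that removed pattern through fancy_regex to show how text is split into pretokens before BPE is applied. It is illustrative only, not part of this commit, and does not reproduce the crate's own Pretokenizer iterator.

```rust
// Illustrative only — not part of this commit. Splits a sample string with the
// legacy r50k/p50k pretokenizer pattern (removed above) using fancy_regex,
// which supports the negative look-ahead `\s+(?!\S)`.
use fancy_regex::Regex;

fn main() {
    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
    let re = Regex::new(pat).expect("valid regex");
    for m in re.find_iter("Hello  world!  ") {
        // Each match is one pretoken; fancy_regex yields Result items.
        println!("{:?}", m.expect("match").as_str());
    }
}
```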

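After this commit, `cl100k_base()` and `o200k_base()` are the only remaining entry points. A minimal usage sketch follows, under assumptions: the dependency is referenced as `bpe_openai`, and `Tokenizer` exposes `count` and `encode` methods, neither of which is shown in this diff.

```rust
// Minimal sketch, not part of this commit. Assumes the crate is used as
// `bpe_openai` and that `Tokenizer` provides `count`/`encode`; those method
// names are not visible in this diff and are illustrative only.
fn main() {
    let tok = bpe_openai::cl100k_base();
    let text = "Dropping r50k and p50k leaves cl100k and o200k.";
    let tokens = tok.encode(text);
    println!("cl100k tokens: {} ({:?})", tok.count(text), tokens);
}
```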