diff --git a/crates/bpe-openai/README.md b/crates/bpe-openai/README.md
index 0e25976..e5116e7 100644
--- a/crates/bpe-openai/README.md
+++ b/crates/bpe-openai/README.md
@@ -7,8 +7,6 @@ For convencience it re-exports the `bpe` crate so that depending on this crate i
 
 Supported tokenizers:
 
-- r50k
-- p50k
 - cl100k
 - o200k
 
diff --git a/crates/bpe-openai/build.rs b/crates/bpe-openai/build.rs
index 472e580..528eae6 100644
--- a/crates/bpe-openai/build.rs
+++ b/crates/bpe-openai/build.rs
@@ -7,8 +7,6 @@ use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
 use serde::Serialize;
 
 fn main() {
-    serialize_tiktoken_bpe("r50k_base", include_bytes!("data/r50k_base.tiktoken.gz"), 1);
-    serialize_tiktoken_bpe("p50k_base", include_bytes!("data/p50k_base.tiktoken.gz"), 1);
     serialize_tiktoken_bpe(
         "cl100k_base",
         include_bytes!("data/cl100k_base.tiktoken.gz"),
diff --git a/crates/bpe-openai/data/p50k_base.tiktoken.gz b/crates/bpe-openai/data/p50k_base.tiktoken.gz
deleted file mode 100644
index af6f846..0000000
Binary files a/crates/bpe-openai/data/p50k_base.tiktoken.gz and /dev/null differ
diff --git a/crates/bpe-openai/data/r50k_base.tiktoken.gz b/crates/bpe-openai/data/r50k_base.tiktoken.gz
deleted file mode 100644
index 6108f82..0000000
Binary files a/crates/bpe-openai/data/r50k_base.tiktoken.gz and /dev/null differ
diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs
index 2bac02d..29f6ce4 100644
--- a/crates/bpe-openai/src/lib.rs
+++ b/crates/bpe-openai/src/lib.rs
@@ -8,20 +8,6 @@ use fancy_regex::Regex;
 
 // The look-ahead character is dropped from the match by the Pretokenizer iterator.
 // Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
-static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
-    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
-});
-
-static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
-    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
-});
-
 static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
@@ -96,14 +82,6 @@ impl Tokenizer {
     }
 }
 
-pub fn r50k_base() -> &'static Tokenizer {
-    &BPE_R50K_BASE
-}
-
-pub fn p50k_base() -> &'static Tokenizer {
-    &BPE_P50K_BASE
-}
-
 pub fn cl100k_base() -> &'static Tokenizer {
     &BPE_CL100K_BASE
 }
@@ -115,23 +93,10 @@ pub fn o200k_base() -> &'static Tokenizer {
 #[cfg(test)]
 mod tests {
     use bpe::byte_pair_encoding::{create_test_string, select_test_string};
-    use tiktoken_rs::{
-        cl100k_base_singleton, o200k_base_singleton, p50k_base_singleton, r50k_base_singleton,
-        CoreBPE,
-    };
+    use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton, CoreBPE};
 
     use super::*;
 
-    #[test]
-    fn test_r50k() {
-        test_equivalence(r50k_base(), &r50k_base_singleton().lock());
-    }
-
-    #[test]
-    fn test_p50k() {
-        test_equivalence(p50k_base(), &p50k_base_singleton().lock());
-    }
-
     #[test]
     fn test_cl100k() {
         test_equivalence(cl100k_base(), &cl100k_base_singleton().lock());