Skip to content

Commit 51c8a8a

Browse files
author
Lőrinc
committed
Fix whitespace catastrophic backtracking
1 parent 5f07fc2 commit 51c8a8a

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

tests/test_encoding.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
1515
def test_extremely_big_encoding(make_enc: Callable[[], tiktoken.Encoding]):
1616
enc = make_enc()
17-
for c in ["^", "0", "a", "'s"]: # TODO " ", "\n" are still failing
17+
for c in ["^", "0", "a", "'s", " ", "\n"]:
1818
print(f"Validating `{c}`")
1919

2020
big_value = c * 10_000

tiktoken_ext/openai_public.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
# The pattern in the original GPT-2 release is:
1010
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
1111
# This is equivalent, but executes faster:
12-
_legacy_splitter_regex = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++"""
12+
_legacy_splitter_regex = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s"""
1313

1414

1515
def gpt2():
@@ -84,7 +84,7 @@ def cl100k_base():
8484
}
8585
return {
8686
"name": "cl100k_base",
87-
"pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s*[\r\n]|\s+(?!\S)|\s++""",
87+
"pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""",
8888
"mergeable_ranks": mergeable_ranks,
8989
"special_tokens": special_tokens,
9090
}

0 commit comments

Comments
 (0)