@@ -6,6 +6,7 @@ use std::num::NonZeroU64;
6
6
use std:: thread;
7
7
8
8
use fancy_regex:: Regex ;
9
+ use fancy_regex:: RegexBuilder ;
9
10
use pyo3:: exceptions;
10
11
use pyo3:: prelude:: * ;
11
12
use pyo3:: pyclass;
@@ -417,7 +418,7 @@ impl CoreBPE {
417
418
special_tokens_encoder : HashMap < String , Rank > ,
418
419
pattern : & str ,
419
420
) -> PyResult < Self > {
420
- let regex = Regex :: new ( pattern)
421
+ let regex = RegexBuilder :: new ( pattern) . backtrack_limit ( 10_000 ) . build ( )
421
422
. map_err ( |e| PyErr :: new :: < exceptions:: PyValueError , _ > ( e. to_string ( ) ) ) ?;
422
423
423
424
let special_regex = {
@@ -572,6 +573,7 @@ fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> {
572
573
573
574
#[ cfg( test) ]
574
575
mod tests {
576
+ use fancy_regex:: RegexBuilder ;
575
577
use rustc_hash:: FxHashMap as HashMap ;
576
578
577
579
use crate :: { byte_pair_split, Rank } ;
@@ -596,4 +598,16 @@ mod tests {
596
598
let res = byte_pair_split ( b"abab" , & ranks) ;
597
599
assert_eq ! ( res, vec![ b"ab" , b"ab" ] ) ;
598
600
}
601
+
602
/// Verifies that a small backtrack limit turns an expensive match into an
/// error instead of letting it run to completion.
///
/// The pattern combines an ambiguous alternation under `*` with a look-ahead,
/// so matching a long run of `"ab"` needs far more than the 10 backtracking
/// steps allowed here.
#[test]
fn test_effect_of_backtrack_limit() {
    // `build()` already yields an owned `Regex`; no clone is needed.
    let regex = RegexBuilder::new(r"(a|b|ab)*(?=c)")
        .backtrack_limit(10)
        .build()
        .expect("Failed to build regex");

    let input = "ab".repeat(100) + "c";

    // `is_match` reports an exceeded limit through its `Result`, not a panic.
    assert!(
        regex.is_match(&input).is_err(),
        "is_match should return an error once the backtrack limit is exceeded"
    );
}
599
613
}
0 commit comments