1+ use std:: time:: Instant ;
2+ use tokenizers:: pre_tokenizers:: whitespace:: { Whitespace , WhitespaceOptimized } ;
3+ use tokenizers:: { OffsetReferential , OffsetType , PreTokenizer , PreTokenizedString } ;
4+
5+ fn main ( ) {
6+ println ! ( "Whitespace Pre-Tokenizer Optimization Demo" ) ;
7+ println ! ( "==========================================\n " ) ;
8+
9+ // Test cases with different characteristics
10+ let test_cases = vec ! [
11+ ( "Simple text" , "Hello world! How are you doing?" ) ,
12+ ( "Mixed content" , "This is a test with numbers 123 and symbols @#$% and unicode: café résumé" ) ,
13+ ( "Whitespace heavy" , "Multiple spaces\t and\n newlines\r \n here" ) ,
14+ ( "Symbol heavy" , "Hello!@#$%^&*()world?><>{}[]|\\ " ) ,
15+ ( "Word heavy" , "This is a very long sentence with many words that should be tokenized properly" ) ,
16+ ( "Unicode heavy" , "αβγ δέζ ηθι κλμ νξο πρσ τυφ χψω" ) ,
17+ ( "Mixed unicode" , "Hello 123 αβγ !@# world δέζ ηθι" ) ,
18+ ] ;
19+
20+ for ( name, text) in test_cases {
21+ println ! ( "Test case: {}" , name) ;
22+ println ! ( "Input: '{}'" , text) ;
23+
24+ // Test original implementation
25+ let start = Instant :: now ( ) ;
26+ let mut original = PreTokenizedString :: from ( text) ;
27+ let original_pretok = Whitespace { } ;
28+ original_pretok. pre_tokenize ( & mut original) . unwrap ( ) ;
29+ let original_duration = start. elapsed ( ) ;
30+
31+ let original_splits = original
32+ . get_splits ( OffsetReferential :: Original , OffsetType :: Byte )
33+ . into_iter ( )
34+ . map ( |( s, o, _) | ( s, o) )
35+ . collect :: < Vec < _ > > ( ) ;
36+
37+ // Test optimized implementation
38+ let start = Instant :: now ( ) ;
39+ let mut optimized = PreTokenizedString :: from ( text) ;
40+ let optimized_pretok = WhitespaceOptimized { } ;
41+ optimized_pretok. pre_tokenize ( & mut optimized) . unwrap ( ) ;
42+ let optimized_duration = start. elapsed ( ) ;
43+
44+ let optimized_splits = optimized
45+ . get_splits ( OffsetReferential :: Original , OffsetType :: Byte )
46+ . into_iter ( )
47+ . map ( |( s, o, _) | ( s, o) )
48+ . collect :: < Vec < _ > > ( ) ;
49+
50+ // Verify compatibility
51+ let compatible = original_splits == optimized_splits;
52+
53+ println ! ( " Original tokens: {:?}" , original_splits) ;
54+ println ! ( " Optimized tokens: {:?}" , optimized_splits) ;
55+ println ! ( " Compatible: {}" , compatible) ;
56+ println ! ( " Original time: {:?}" , original_duration) ;
57+ println ! ( " Optimized time: {:?}" , optimized_duration) ;
58+
59+ if original_duration > optimized_duration {
60+ let speedup = original_duration. as_nanos ( ) as f64 / optimized_duration. as_nanos ( ) as f64 ;
61+ println ! ( " Speedup: {:.2}x" , speedup) ;
62+ } else {
63+ let slowdown = optimized_duration. as_nanos ( ) as f64 / original_duration. as_nanos ( ) as f64 ;
64+ println ! ( " Slowdown: {:.2}x" , slowdown) ;
65+ }
66+ println ! ( ) ;
67+ }
68+
69+ // Performance test with large text
70+ println ! ( "Large text performance test:" ) ;
71+ let base_text = "Hello world! This is a test with numbers 123 and symbols @#$% and unicode: café résumé. " ;
72+ let large_text: String = base_text. repeat ( 1000 ) ; // ~50KB of text
73+ println ! ( "Text size: {} bytes" , large_text. len( ) ) ;
74+
75+ // Warm up
76+ for _ in 0 ..10 {
77+ let mut _warmup = PreTokenizedString :: from ( & large_text) ;
78+ let _pretok = Whitespace { } ;
79+ // Don't actually call pre_tokenize to avoid affecting results
80+ }
81+
82+ // Benchmark original
83+ let iterations = 100 ;
84+ let start = Instant :: now ( ) ;
85+ for _ in 0 ..iterations {
86+ let mut pretokenized = PreTokenizedString :: from ( & large_text) ;
87+ let pretok = Whitespace { } ;
88+ pretok. pre_tokenize ( & mut pretokenized) . unwrap ( ) ;
89+ }
90+ let original_total = start. elapsed ( ) ;
91+ let original_avg = original_total / iterations;
92+
93+ // Benchmark optimized
94+ let start = Instant :: now ( ) ;
95+ for _ in 0 ..iterations {
96+ let mut pretokenized = PreTokenizedString :: from ( & large_text) ;
97+ let pretok = WhitespaceOptimized { } ;
98+ pretok. pre_tokenize ( & mut pretokenized) . unwrap ( ) ;
99+ }
100+ let optimized_total = start. elapsed ( ) ;
101+ let optimized_avg = optimized_total / iterations;
102+
103+ println ! ( " Original average: {:?}" , original_avg) ;
104+ println ! ( " Optimized average: {:?}" , optimized_avg) ;
105+
106+ if original_avg > optimized_avg {
107+ let speedup = original_avg. as_nanos ( ) as f64 / optimized_avg. as_nanos ( ) as f64 ;
108+ println ! ( " Overall speedup: {:.2}x" , speedup) ;
109+ } else {
110+ let slowdown = optimized_avg. as_nanos ( ) as f64 / original_avg. as_nanos ( ) as f64 ;
111+ println ! ( " Overall slowdown: {:.2}x" , slowdown) ;
112+ }
113+
114+ println ! ( "\n Note: Performance results may vary depending on hardware and system load." ) ;
115+ println ! ( "For accurate benchmarks, run: cargo bench --bench whitespace_benchmark" ) ;
116+ }