@@ -5,19 +5,13 @@ use std::{collections::HashMap, sync::Arc};
55use crate :: base:: field_attrs;
66use crate :: { fields_value, ops:: sdk:: * } ;
77
8- #[ derive( Debug , Deserialize ) ]
9- pub struct Spec {
10- #[ serde( default ) ]
11- language : Option < String > ,
12-
13- chunk_size : usize ,
14-
15- #[ serde( default ) ]
16- chunk_overlap : usize ,
17- }
8+ type Spec = EmptySpec ;
189
1910pub struct Args {
2011 text : ResolvedOpArg ,
12+ chunk_size : ResolvedOpArg ,
13+ chunk_overlap : Option < ResolvedOpArg > ,
14+ language : Option < ResolvedOpArg > ,
2115}
2216
2317static DEFAULT_SEPARATORS : LazyLock < Vec < Regex > > = LazyLock :: new ( || {
@@ -97,36 +91,13 @@ static SEPARATORS_BY_LANG: LazyLock<HashMap<&'static str, Vec<Regex>>> = LazyLoc
9791 . collect ( )
9892} ) ;
9993
100- struct Executor {
101- spec : Spec ,
102- args : Args ,
94+ struct SplitTask {
10395 separators : & ' static [ Regex ] ,
96+ chunk_size : usize ,
97+ chunk_overlap : usize ,
10498}
10599
106- impl Executor {
107- fn new ( spec : Spec , args : Args ) -> Result < Self > {
108- let separators = spec
109- . language
110- . as_ref ( )
111- . and_then ( |lang| {
112- SEPARATORS_BY_LANG
113- . get ( lang. to_lowercase ( ) . as_str ( ) )
114- . map ( |v| v. as_slice ( ) )
115- } )
116- . unwrap_or ( DEFAULT_SEPARATORS . as_slice ( ) ) ;
117- Ok ( Self {
118- spec,
119- args,
120- separators,
121- } )
122- }
123-
124- fn add_output < ' s > ( pos : usize , text : & ' s str , output : & mut Vec < ( RangeValue , & ' s str ) > ) {
125- if !text. trim ( ) . is_empty ( ) {
126- output. push ( ( RangeValue :: new ( pos, pos + text. len ( ) ) , text) ) ;
127- }
128- }
129-
100+ impl SplitTask {
130101 fn split_substring < ' s > (
131102 & self ,
132103 s : & ' s str ,
@@ -135,7 +106,7 @@ impl Executor {
135106 output : & mut Vec < ( RangeValue , & ' s str ) > ,
136107 ) {
137108 if next_sep_id >= self . separators . len ( ) {
138- Self :: add_output ( base_pos, s, output) ;
109+ self . add_output ( base_pos, s, output) ;
139110 return ;
140111 }
141112
@@ -147,17 +118,17 @@ impl Executor {
147118 let mut start_pos = chunks[ 0 ] . start ;
148119 for i in 1 ..chunks. len ( ) - 1 {
149120 let chunk = & chunks[ i] ;
150- if chunk. end - start_pos > self . spec . chunk_size {
151- Self :: add_output ( base_pos + start_pos, & s[ start_pos..chunk. end ] , output) ;
121+ if chunk. end - start_pos > self . chunk_size {
122+ self . add_output ( base_pos + start_pos, & s[ start_pos..chunk. end ] , output) ;
152123
153124 // Find the new start position, allowing overlap within the threshold.
154125 let mut new_start_idx = i + 1 ;
155126 let next_chunk = & chunks[ i + 1 ] ;
156127 while new_start_idx > 0 {
157128 let prev_pos = chunks[ new_start_idx - 1 ] . start ;
158129 if prev_pos <= start_pos
159- || chunk. end - prev_pos > self . spec . chunk_overlap
160- || next_chunk. end - prev_pos > self . spec . chunk_size
130+ || chunk. end - prev_pos > self . chunk_overlap
131+ || next_chunk. end - prev_pos > self . chunk_size
161132 {
162133 break ;
163134 }
@@ -168,32 +139,49 @@ impl Executor {
168139 }
169140
170141 let last_chunk = & chunks[ chunks. len ( ) - 1 ] ;
171- Self :: add_output ( base_pos + start_pos, & s[ start_pos..last_chunk. end ] , output) ;
142+ self . add_output ( base_pos + start_pos, & s[ start_pos..last_chunk. end ] , output) ;
172143 } ;
173144
174145 let mut small_chunks = Vec :: new ( ) ;
175- let mut process_chunk = |start : usize , end : usize | {
176- let chunk = & s[ start..end] ;
177- if chunk. len ( ) <= self . spec . chunk_size {
178- small_chunks. push ( RangeValue :: new ( start, start + chunk. len ( ) ) ) ;
179- } else {
180- flush_small_chunks ( & small_chunks, output) ;
181- small_chunks. clear ( ) ;
182- self . split_substring ( chunk, base_pos + start, next_sep_id + 1 , output) ;
183- }
184- } ;
146+ let mut process_chunk =
147+ |start : usize , end : usize , output : & mut Vec < ( RangeValue , & ' s str ) > | {
148+ let chunk = & s[ start..end] ;
149+ if chunk. len ( ) <= self . chunk_size {
150+ small_chunks. push ( RangeValue :: new ( start, start + chunk. len ( ) ) ) ;
151+ } else {
152+ flush_small_chunks ( & small_chunks, output) ;
153+ small_chunks. clear ( ) ;
154+ self . split_substring ( chunk, base_pos + start, next_sep_id + 1 , output) ;
155+ }
156+ } ;
185157
186158 let mut next_start_pos = 0 ;
187159 for cap in self . separators [ next_sep_id] . find_iter ( s) {
188- process_chunk ( next_start_pos, cap. start ( ) ) ;
160+ process_chunk ( next_start_pos, cap. start ( ) , output ) ;
189161 next_start_pos = cap. end ( ) ;
190162 }
191163 if next_start_pos < s. len ( ) {
192- process_chunk ( next_start_pos, s. len ( ) ) ;
164+ process_chunk ( next_start_pos, s. len ( ) , output ) ;
193165 }
194166
195167 flush_small_chunks ( & small_chunks, output) ;
196168 }
169+
170+ fn add_output < ' s > ( & self , pos : usize , text : & ' s str , output : & mut Vec < ( RangeValue , & ' s str ) > ) {
171+ if !text. trim ( ) . is_empty ( ) {
172+ output. push ( ( RangeValue :: new ( pos, pos + text. len ( ) ) , text) ) ;
173+ }
174+ }
175+ }
176+
177+ struct Executor {
178+ args : Args ,
179+ }
180+
181+ impl Executor {
182+ fn new ( args : Args ) -> Result < Self > {
183+ Ok ( Self { args } )
184+ }
197185}
198186
199187fn translate_bytes_to_chars < ' a > ( text : & str , offsets : impl Iterator < Item = & ' a mut usize > ) {
@@ -229,9 +217,32 @@ fn translate_bytes_to_chars<'a>(text: &str, offsets: impl Iterator<Item = &'a mu
229217#[ async_trait]
230218impl SimpleFunctionExecutor for Executor {
231219 async fn evaluate ( & self , input : Vec < Value > ) -> Result < Value > {
220+ let task = SplitTask {
221+ separators : self
222+ . args
223+ . language
224+ . value ( & input) ?
225+ . map ( |v| v. as_str ( ) )
226+ . transpose ( ) ?
227+ . and_then ( |lang| {
228+ SEPARATORS_BY_LANG
229+ . get ( lang. to_lowercase ( ) . as_str ( ) )
230+ . map ( |v| v. as_slice ( ) )
231+ } )
232+ . unwrap_or ( DEFAULT_SEPARATORS . as_slice ( ) ) ,
233+ chunk_size : self . args . chunk_size . value ( & input) ?. as_int64 ( ) ? as usize ,
234+ chunk_overlap : self
235+ . args
236+ . chunk_overlap
237+ . value ( & input) ?
238+ . map ( |v| v. as_int64 ( ) )
239+ . transpose ( ) ?
240+ . unwrap_or ( 0 ) as usize ,
241+ } ;
242+
232243 let text = self . args . text . value ( & input) ?. as_str ( ) ?;
233244 let mut output = Vec :: new ( ) ;
234- self . split_substring ( text, 0 , 0 , & mut output) ;
245+ task . split_substring ( text, 0 , 0 , & mut output) ;
235246
236247 translate_bytes_to_chars (
237248 text,
@@ -271,6 +282,15 @@ impl SimpleFunctionFactoryBase for Factory {
271282 text : args_resolver
272283 . next_arg ( "text" ) ?
273284 . expect_type ( & ValueType :: Basic ( BasicValueType :: Str ) ) ?,
285+ chunk_size : args_resolver
286+ . next_arg ( "chunk_size" ) ?
287+ . expect_type ( & ValueType :: Basic ( BasicValueType :: Int64 ) ) ?,
288+ chunk_overlap : args_resolver
289+ . next_optional_arg ( "chunk_overlap" ) ?
290+ . expect_type ( & ValueType :: Basic ( BasicValueType :: Int64 ) ) ?,
291+ language : args_resolver
292+ . next_optional_arg ( "language" ) ?
293+ . expect_type ( & ValueType :: Basic ( BasicValueType :: Str ) ) ?,
274294 } ;
275295 let output_schema = make_output_type ( CollectionSchema :: new (
276296 CollectionKind :: Table ,
@@ -288,10 +308,10 @@ impl SimpleFunctionFactoryBase for Factory {
288308
289309 async fn build_executor (
290310 self : Arc < Self > ,
291- spec : Spec ,
311+ _spec : Spec ,
292312 args : Args ,
293313 _context : Arc < FlowInstanceContext > ,
294314 ) -> Result < Box < dyn SimpleFunctionExecutor > > {
295- Ok ( Box :: new ( Executor :: new ( spec , args) ?) )
315+ Ok ( Box :: new ( Executor :: new ( args) ?) )
296316 }
297317}
0 commit comments