Skip to content

Commit 855666d

Browse files
authored
Specialize Prefix/Suffix Match for Like/ILike between Array and Scalar for StringViewArray (#6231)
* v2 impl * Add bench * fix clippy * fix endswith * Finalize the prefix_v2 implementation * stop reverse string for ends_with * Fix comments * fix bad comment * Correct equals semantics
1 parent f73dbc3 commit 855666d

File tree

3 files changed

+176
-17
lines changed

3 files changed

+176
-17
lines changed

arrow-array/src/array/byte_view_array.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, S
2424
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
2525
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
2626
use arrow_schema::{ArrowError, DataType};
27+
use core::str;
2728
use num::ToPrimitive;
2829
use std::any::Any;
2930
use std::fmt::Debug;
@@ -301,6 +302,69 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
301302
ArrayIter::new(self)
302303
}
303304

305+
/// Returns an iterator over the raw bytes of every value in this array.
///
/// One slice is produced per view, in view order. Validity is not consulted here: the
/// closure runs on every view, so no slot is filtered out or skipped.
306+
pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
307+
self.views.iter().map(move |v| {
308+
// The low 32 bits of the view are used as the value's length in bytes.
let len = *v as u32;
309+
if len <= 12 {
310+
// Short values (at most 12 bytes) are read directly out of the view itself.
// SAFETY (review): relies on `len <= 12`, checked just above — confirm this is
// the full contract of `inline_value`.
unsafe { Self::inline_value(v, len as usize) }
311+
} else {
312+
// Longer values live in one of the data buffers; the view says which buffer
// and at which offset.
let view = ByteView::from(*v);
313+
let data = &self.buffers[view.buffer_index as usize];
314+
let offset = view.offset as usize;
315+
// SAFETY (review): assumes `offset + len` lies within `data`, i.e. that the
// views were validated when the array was built — confirm against constructors.
unsafe { data.get_unchecked(offset..offset + len as usize) }
316+
}
317+
})
318+
}
319+
320+
/// Returns an iterator yielding, for each value in this array, its first `prefix_len` bytes.
321+
/// If a value is shorter than `prefix_len`, the empty slice is yielded for it instead.
///
/// Because an empty slice is also what a `prefix_len` of 0 produces, callers comparing
/// against a needle should compare lengths as well as contents.
322+
pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
323+
self.views().into_iter().map(move |v| {
324+
// The low 32 bits of the view are used as the value's length in bytes.
let len = (*v as u32) as usize;
325+
326+
// Value is too short to contain a prefix of the requested length.
if len < prefix_len {
327+
return &[] as &[u8];
328+
}
329+
330+
// Fast path: read the prefix straight from the view without touching a data buffer.
// This is taken for fully inlined values (`len <= 12`) and for any prefix of at most
// 4 bytes — presumably because every view keeps its first 4 data bytes inline (see
// the Arrow view layout; TODO confirm).
if prefix_len <= 4 || len <= 12 {
331+
// SAFETY (review): `prefix_len <= len` is guaranteed by the early return above;
// assumes `inline_value` only needs the requested bytes to be present in the view.
unsafe { StringViewArray::inline_value(v, prefix_len) }
332+
} else {
333+
let view = ByteView::from(*v);
334+
// SAFETY (review): assumes `buffer_index` refers to an existing data buffer of
// this array — confirm views are validated on construction.
let data = unsafe {
335+
self.data_buffers()
336+
.get_unchecked(view.buffer_index as usize)
337+
};
338+
let offset = view.offset as usize;
339+
// SAFETY (review): `prefix_len <= len`, so this range is inside the value's bytes
// provided `offset..offset + len` is itself inside `data`.
unsafe { data.get_unchecked(offset..offset + prefix_len) }
340+
}
341+
})
342+
}
343+
344+
/// Returns an iterator yielding, for each value in this array, its last `suffix_len` bytes.
345+
/// If a value is shorter than `suffix_len`, the empty slice is yielded for it instead.
///
/// Because an empty slice is also what a `suffix_len` of 0 produces, callers comparing
/// against a needle should compare lengths as well as contents.
346+
pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
347+
self.views().into_iter().map(move |v| {
348+
// The low 32 bits of the view are used as the value's length in bytes.
let len = (*v as u32) as usize;
349+
350+
// Value is too short to contain a suffix of the requested length.
if len < suffix_len {
351+
return &[] as &[u8];
352+
}
353+
354+
if len <= 12 {
355+
// Inlined value: fetch all `len` bytes from the view, then keep only the tail.
// `len - suffix_len` cannot underflow because of the early return above.
// SAFETY (review): relies on `len <= 12` — confirm this is the full contract of
// `inline_value`.
unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
356+
} else {
357+
let view = ByteView::from(*v);
358+
// SAFETY (review): assumes `buffer_index` refers to an existing data buffer of
// this array — confirm views are validated on construction.
let data = unsafe {
359+
self.data_buffers()
360+
.get_unchecked(view.buffer_index as usize)
361+
};
362+
let offset = view.offset as usize;
363+
// The suffix is the last `suffix_len` bytes of `offset..offset + len`.
// SAFETY (review): in bounds provided `offset..offset + len` is inside `data`.
unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
364+
}
365+
})
366+
}
367+
304368
/// Returns a zero-copy slice of this array with the indicated offset and length.
305369
pub fn slice(&self, offset: usize, length: usize) -> Self {
306370
Self {

arrow-string/src/like.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,27 @@ mod tests {
989989
vec![false, true, true, false, false, false, false, true, true, true, true]
990990
);
991991

992+
// 😈 is four bytes long.
993+
test_utf8_scalar!(
994+
test_uff8_array_like_multibyte,
995+
vec![
996+
"sdlkdfFooßsdfs",
997+
"sdlkdfFooSSdggs",
998+
"sdlkdfFoosssdsd",
999+
"FooS",
1000+
"Foos",
1001+
"ffooSS",
1002+
"ffooß",
1003+
"😃sadlksffofsSsh😈klF",
1004+
"😱slgffoesSsh😈klF",
1005+
"FFKoSS",
1006+
"longer than 12 bytes FFKoSS",
1007+
],
1008+
"%Ssh😈klF",
1009+
like,
1010+
vec![false, false, false, false, false, false, false, true, true, false, false]
1011+
);
1012+
9921013
test_utf8_scalar!(
9931014
test_utf8_array_ilike_scalar_one,
9941015
vec![

arrow-string/src/predicate.rs

Lines changed: 91 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow_array::{ArrayAccessor, BooleanArray};
18+
use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray};
1919
use arrow_schema::ArrowError;
2020
use memchr::memchr2;
2121
use memchr::memmem::Finder;
@@ -114,28 +114,103 @@ impl<'a> Predicate<'a> {
114114
Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| {
115115
haystack.eq_ignore_ascii_case(v) != negate
116116
}),
117-
Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| {
118-
finder.find(haystack.as_bytes()).is_some() != negate
119-
}),
120-
Predicate::StartsWith(v) => BooleanArray::from_unary(array, |haystack| {
121-
starts_with(haystack, v, equals_kernel) != negate
122-
}),
123-
Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, |haystack| {
124-
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
125-
}),
126-
Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| {
127-
ends_with(haystack, v, equals_kernel) != negate
128-
}),
129-
Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, |haystack| {
130-
ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
131-
}),
117+
Predicate::Contains(finder) => {
118+
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
119+
BooleanArray::from(
120+
string_view_array
121+
.bytes_iter()
122+
.map(|haystack| finder.find(haystack).is_some() != negate)
123+
.collect::<Vec<_>>(),
124+
)
125+
} else {
126+
BooleanArray::from_unary(array, |haystack| {
127+
finder.find(haystack.as_bytes()).is_some() != negate
128+
})
129+
}
130+
}
131+
Predicate::StartsWith(v) => {
132+
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
133+
BooleanArray::from(
134+
string_view_array
135+
.prefix_bytes_iter(v.len())
136+
.map(|haystack| {
137+
equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate
138+
})
139+
.collect::<Vec<_>>(),
140+
)
141+
} else {
142+
BooleanArray::from_unary(array, |haystack| {
143+
starts_with(haystack, v, equals_kernel) != negate
144+
})
145+
}
146+
}
147+
Predicate::IStartsWithAscii(v) => {
148+
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
149+
BooleanArray::from(
150+
string_view_array
151+
.prefix_bytes_iter(v.len())
152+
.map(|haystack| {
153+
equals_bytes(
154+
haystack,
155+
v.as_bytes(),
156+
equals_ignore_ascii_case_kernel,
157+
) != negate
158+
})
159+
.collect::<Vec<_>>(),
160+
)
161+
} else {
162+
BooleanArray::from_unary(array, |haystack| {
163+
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
164+
})
165+
}
166+
}
167+
Predicate::EndsWith(v) => {
168+
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
169+
BooleanArray::from(
170+
string_view_array
171+
.suffix_bytes_iter(v.len())
172+
.map(|haystack| {
173+
equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate
174+
})
175+
.collect::<Vec<_>>(),
176+
)
177+
} else {
178+
BooleanArray::from_unary(array, |haystack| {
179+
ends_with(haystack, v, equals_kernel) != negate
180+
})
181+
}
182+
}
183+
Predicate::IEndsWithAscii(v) => {
184+
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
185+
BooleanArray::from(
186+
string_view_array
187+
.suffix_bytes_iter(v.len())
188+
.map(|haystack| {
189+
equals_bytes(
190+
haystack,
191+
v.as_bytes(),
192+
equals_ignore_ascii_case_kernel,
193+
) != negate
194+
})
195+
.collect::<Vec<_>>(),
196+
)
197+
} else {
198+
BooleanArray::from_unary(array, |haystack| {
199+
ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
200+
})
201+
}
202+
}
132203
Predicate::Regex(v) => {
133204
BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate)
134205
}
135206
}
136207
}
137208
}
138209

210+
/// Returns `true` when `lhs` and `rhs` have the same length and `byte_eq_kernel` accepts
/// every pair of corresponding bytes.
///
/// The length check runs first, so slices of different lengths never compare equal and the
/// kernel is not invoked for them. Two empty slices compare equal.
fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {
211+
lhs.len() == rhs.len() && zip(lhs, rhs).all(byte_eq_kernel)
212+
}
213+
139214
/// This is faster than `str::starts_with` for small strings.
140215
/// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
141216
fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {
@@ -145,7 +220,6 @@ fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8))
145220
zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel)
146221
}
147222
}
148-
149223
/// This is faster than `str::ends_with` for small strings.
150224
/// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
151225
fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {

0 commit comments

Comments
 (0)