Skip to content

Commit 59320c6

Browse files
authored
Refactor char/string and byte search (#54667)
This is a refactoring of `base/strings/search.jl`. It is purely internal, and comes with no changes in behaviour. It's based on #54593 and #54579, so those need to get merged first, then this PR will be rebased onto master. Included changes are: * The char/string search functions now use the last byte to memchr, not the first byte. Because the last bytes are more varied, this is much faster on small non-ASCII alphabets (like searching Greek or Cyrillic text) and somewhat faster on large non-ASCII ones (like Japanese). Speed on ASCII alphabets (like English) is unchanged. * Several unused or redundant methods have been removed * Moved boundschecks from the inner `_search` and `_rsearch` functions to the outer top-level functions that call them. This is because the former may be called in a loop where repeated boundschecking is needless. This should speed up search a bit. * Char/string search functions are now implemented in terms of an internal lazy iterator. This allows `findall` and `findnext` to share implementation, and will also make it trivially easy to implement a lazy findall in the future (see #43737) IMO there is still more work to be done on this file, but this requires a decision to be made on #43737, #54581 or #54584 ## Benchmarks ```julia using BenchmarkTools using Random rng = Xoshiro(55) greek = join(rand(rng, 'Α':'ψ', 100000)) * 'ω' @btime findfirst('ω', greek) @btime findfirst(==('\xce'), greek) english = join(rand(rng, 'A':'y', 100000)) * 'z' @btime findfirst('z', english) @btime findall('A', english) @btime findall('\xff', english) nothing ``` 1.11.0-beta2: ``` 100.049 μs (1 allocation: 16 bytes) 474.084 μs (0 allocations: 0 bytes) 689.110 ns (1 allocation: 16 bytes) 93.536 μs (9 allocations: 21.84 KiB) 72.316 μs (1 allocation: 32 bytes) ``` This PR: ``` 1.319 μs (1 allocation: 16 bytes) 398.011 μs (0 allocations: 0 bytes) 681.550 ns (1 allocation: 16 bytes) 8.867 μs (8 allocations: 21.81 KiB) 683.962 ns (1 allocation: 32 bytes) ```
1 parent 3be18c3 commit 59320c6

File tree

5 files changed

+212
-97
lines changed

5 files changed

+212
-97
lines changed

base/char.jl

-3
Original file line numberDiff line numberDiff line change
@@ -224,9 +224,6 @@ isless(x::Char, y::Char) = bitcast(UInt32, x) < bitcast(UInt32, y)
224224
hash(x::Char, h::UInt) =
225225
hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
226226

227-
first_utf8_byte(c::Char) = (bitcast(UInt32, c) >> 24) % UInt8
228-
first_utf8_byte(c::AbstractChar) = first_utf8_byte(Char(c)::Char)
229-
230227
# fallbacks:
231228
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
232229
==(x::AbstractChar, y::AbstractChar) = Char(x) == Char(y)

base/strings/search.jl

+169-91
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,6 @@ const DenseUInt8 = Union{
3131

3232
const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8}
3333

34-
last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
35-
last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)
36-
3734
function last_utf8_byte(c::Char)
3835
u = reinterpret(UInt32, c)
3936
shift = ((4 - ncodeunits(c)) * 8) & 31
@@ -44,144 +41,226 @@ end
4441
# This holds even in the presence of invalid UTF8
4542
is_standalone_byte(x::UInt8) = (x < 0x80) | (x > 0xf7)
4643

47-
function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
48-
s::Union{String, SubString{String}}, i::Integer)
49-
if i < 1 || i > sizeof(s)
50-
i == sizeof(s) + 1 && return nothing
51-
throw(BoundsError(s, i))
44+
last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
45+
last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)
46+
47+
# Internal type - lazy iterator over positions of char in string
48+
struct FwCharPosIter{S}
49+
string::S # S is assumed to be either String or SubString{String}
50+
char::Char
51+
# Char searchers search for the last UTF8 byte, because this byte tends to
52+
# have the most variety in real texts, so any individual value is rarer.
53+
# This allows more work to be done in the fast path using memchr.
54+
last_char_byte::UInt8
55+
end
56+
57+
function FwCharPosIter(s::Union{String, SubString{String}}, c::AbstractChar)
58+
char = Char(c)::Char
59+
byte = last_utf8_byte(char)
60+
FwCharPosIter{typeof(s)}(s, char, byte)
61+
end
62+
63+
# i is the index in the string to search from.
64+
# We assume it's never < firstindex(s.string)
65+
function Base.iterate(s::FwCharPosIter, i::Int=1)
66+
scu = ncodeunits(s.string)
67+
68+
# By definition, if the last byte is a standalone byte, then the char
69+
# is a single-byte char where the byte can never be a subset of another char.
70+
# Hence, we can simply search for the occurrence of the byte itself.
71+
if is_standalone_byte(s.last_char_byte)
72+
i > scu && return nothing
73+
i = _search(s.string, s.last_char_byte, i)
74+
i === nothing ? nothing : (i, i + 1)
75+
else
76+
ncu = ncodeunits(s.char)
77+
while true
78+
i > scu && return nothing
79+
i = _search(s.string, s.last_char_byte, i)
80+
i === nothing && return nothing
81+
# Increment i before the continue to avoid infinite loop.
82+
# Since we search for the last byte in the char, the index has an offset.
83+
i += 1
84+
index = i - ncu
85+
# The byte may be part of a different char, in which case index
86+
# may be invalid.
87+
isvalid(s.string, index) || continue
88+
# Here, we use iterate instead of indexing, because indexing needlessly
89+
# re-validates the index which we have already done here.
90+
# This relies on the implementation detail that the iterator state for
91+
# iterating strings is the same as the byte index.
92+
char = first(something(iterate(s.string, index)))
93+
char == s.char && return (index, i)
94+
end
5295
end
53-
@inbounds isvalid(s, i) || string_index_err(s, i)
54-
c = pred.x
55-
c ≤ '\x7f' && return _search(s, first_utf8_byte(c), i)
56-
while true
57-
i = _search(s, first_utf8_byte(c), i)
58-
i === nothing && return nothing
59-
isvalid(s, i) && pred(s[i]) && return i
60-
i = nextind(s, i)
96+
end
97+
98+
# Internal type - lazy iterator over positions of char in string, in reverse order
99+
struct RvCharPosIter{S}
100+
string::S # S is assumed to be either String or SubString{String}
101+
char::Char
102+
last_char_byte::UInt8
103+
end
104+
105+
IteratorSize(s::Type{<:Union{FwCharPosIter, RvCharPosIter}}) = SizeUnknown()
106+
eltype(::Type{<:Union{FwCharPosIter, RvCharPosIter}}) = Int
107+
108+
function RvCharPosIter(s::Union{String, SubString{String}}, c::AbstractChar)
109+
char = Char(c)::Char
110+
byte = last_utf8_byte(char)
111+
RvCharPosIter{typeof(s)}(s, char, byte)
112+
end
113+
114+
# i is the index in the string to search from
115+
# We assume it's never > ncodeunits(s.string)
116+
# This is the same implementation as FwCharPosIter, except for two differences:
117+
# 1. i must be decremented, not incremented because we are searching backwards
118+
# 2. Because we search for the last byte, the starting value of i needs to be
119+
# incremented in the beginning, as that byte may be found at i + ncodeunits(char) - 1.
120+
function Base.iterate(s::RvCharPosIter, i::Int=ncodeunits(s.string))
121+
ncu = ncodeunits(s.char)
122+
if is_standalone_byte(s.last_char_byte)
123+
i < ncu && return nothing
124+
i = _rsearch(s.string, s.last_char_byte, i)
125+
i === nothing ? nothing : (i, i - 1)
126+
else
127+
i = min(ncodeunits(s.string), i + ncu - 1)
128+
while true
129+
i < ncu && return nothing
130+
i = _rsearch(s.string, s.last_char_byte, i)
131+
i === nothing && return nothing
132+
index = i - ncu + 1
133+
i -= 1
134+
isvalid(s.string, index) || continue
135+
char = first(something(iterate(s.string, index)))
136+
char == s.char && return (index, i)
137+
end
61138
end
62139
end
63140

64-
function findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8, Int8}}, a::Union{DenseInt8, DenseUInt8})
65-
findnext(pred, a, firstindex(a))
141+
function try_next(x, state)
142+
y = iterate(x, state)
143+
y === nothing ? nothing : first(y)
144+
end
145+
146+
function findnext(
147+
pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
148+
s::Union{String, SubString{String}},
149+
i::Integer,
150+
)
151+
# TODO: Redesign these strange rules for errors, see #54584
152+
scu = ncodeunits(s)
153+
i == scu + 1 && return nothing
154+
@boundscheck if i < 1 || i > scu + 1
155+
throw(BoundsError(s, i))
156+
end
157+
# The most common case is probably searching for an ASCII char.
158+
# We inline this critical path here to avoid instantiating a
159+
# FwCharPosIter in the common case.
160+
c = Char(pred.x)::Char
161+
u = (reinterpret(UInt32, c) >> 24) % UInt8
162+
i = Int(i)::Int
163+
isvalid(s, i) || string_index_err(s, i)
164+
return if is_standalone_byte(u)
165+
_search(s, u, i)
166+
else
167+
try_next(FwCharPosIter(s, c, last_utf8_byte(c)), i)
168+
end
66169
end
67170

68171
function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
172+
@boundscheck i < firstindex(a) && throw(BoundsError(a, i))
173+
i > lastindex(a) && return nothing
69174
_search(a, pred.x, i)
70175
end
71176

72177
function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
178+
@boundscheck i < firstindex(a) && throw(BoundsError(a, i))
179+
i > lastindex(a) && return nothing
73180
_search(a, pred.x, i)
74181
end
75182

76183
# iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same,
77184
# so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa
78-
findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = _search(a, zero(UInt8))
79-
findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _search(a, zero(UInt8), i)
185+
function findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer)
186+
@boundscheck i < firstindex(a) && throw(BoundsError(a, i))
187+
i > lastindex(a) && return nothing
188+
_search(a, zero(UInt8), i)
189+
end
80190

191+
# This is essentially just a wrapper around memchr. i must be inbounds.
81192
function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = firstindex(a))
82193
fst = firstindex(a)
83-
lst = last_byteindex(a)
84-
if i < fst
85-
throw(BoundsError(a, i))
86-
end
87-
n_bytes = lst - i + 1
88-
if i > lst
89-
return i == lst+1 ? nothing : throw(BoundsError(a, i))
90-
end
91194
GC.@preserve a begin
92195
p = pointer(a)
93-
q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, n_bytes)
196+
q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, last_byteindex(a) - i + 1)
94197
end
95198
return q == C_NULL ? nothing : (q-p+fst) % Int
96199
end
97200

98-
function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a))
99-
if isascii(b)
100-
_search(a,UInt8(b),i)
101-
else
102-
_search(a,codeunits(string(b)),i).start
201+
function findprev(
202+
pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
203+
s::Union{String, SubString{String}},
204+
i::Integer,
205+
)
206+
# TODO: Redesign these strange rules for errors, see #54584
207+
if i == ncodeunits(s) + 1 || i == 0
208+
return nothing
103209
end
104-
end
105-
106-
function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
107-
s::Union{String, SubString{String}}, i::Integer)
108-
c = pred.x
109-
c ≤ '\x7f' && return _rsearch(s, first_utf8_byte(c), i)
110-
b = first_utf8_byte(c)
111-
while true
112-
i = _rsearch(s, b, i)
113-
i == nothing && return nothing
114-
isvalid(s, i) && pred(s[i]) && return i
115-
i = prevind(s, i)
210+
@boundscheck if i < 1 || i > ncodeunits(s) + 1
211+
throw(BoundsError(s, i))
212+
end
213+
# Manually inline the fast path if c is ASCII, as we expect it to often be
214+
c = Char(pred.x)::Char
215+
u = (reinterpret(UInt32, c) >> 24) % UInt8
216+
i = Int(i)::Int
217+
return if is_standalone_byte(u)
218+
_rsearch(s, u, i)
219+
else
220+
try_next(RvCharPosIter(s, c, last_utf8_byte(c)), i)
116221
end
117-
end
118-
119-
function findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::DenseUInt8OrInt8)
120-
findprev(pred, a, lastindex(a))
121222
end
122223

123224
function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
225+
@boundscheck i > lastindex(a) && throw(BoundsError(a, i))
226+
i < firstindex(a) && return nothing
124227
_rsearch(a, pred.x, i)
125228
end
126229

127230
function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
231+
@boundscheck i > lastindex(a) && throw(BoundsError(a, i))
232+
i < firstindex(a) && return nothing
128233
_rsearch(a, pred.x, i)
129234
end
130235

131236
# See comments above for findfirst(::typeof(iszero)) methods
132-
findlast(::typeof(iszero), a::DenseUInt8OrInt8) = _rsearch(a, zero(UInt8))
133-
findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _rsearch(a, zero(UInt8), i)
237+
function findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer)
238+
@boundscheck i > lastindex(a) && throw(BoundsError(a, i))
239+
i < firstindex(a) && return nothing
240+
_rsearch(a, zero(UInt8), i)
241+
end
134242

243+
# This is essentially just a wrapper around memrchr. i must be inbounds.
135244
function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = last_byteindex(a))
136245
fst = firstindex(a)
137-
lst = last_byteindex(a)
138-
if i < fst
139-
return i == fst - 1 ? nothing : throw(BoundsError(a, i))
140-
end
141-
if i > lst
142-
return i == lst+1 ? nothing : throw(BoundsError(a, i))
143-
end
144246
GC.@preserve a begin
145247
p = pointer(a)
146248
q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i-fst+1)
147249
end
148250
return q == C_NULL ? nothing : (q-p+fst) % Int
149251
end
150252

151-
function _rsearch(a::DenseUInt8, b::AbstractChar, i::Integer = length(a))
152-
if isascii(b)
153-
_rsearch(a,UInt8(b),i)
154-
else
155-
_rsearch(a,codeunits(string(b)),i).start
156-
end
157-
end
158-
159253
function findall(
160254
pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
161-
s::Union{String, SubString{String}}
255+
s::Union{String, SubString{String}},
162256
)
163-
c = Char(pred.x)::Char
164-
byte = last_utf8_byte(c)
165-
ncu = ncodeunits(c)
166-
167-
# If only one byte, and can't be part of another Char: Forward to memchr.
168-
is_standalone_byte(byte) && return findall(==(byte), codeunits(s))
169-
result = Int[]
170-
i = firstindex(s)
171-
while true
172-
i = _search(s, byte, i)
173-
isnothing(i) && return result
174-
i += 1
175-
index = i - ncu
176-
# If the char is invalid, it's possible that its first byte is
177-
# inside another char. If so, indexing into the string will throw an
178-
# error, so we need to check for valid indices.
179-
isvalid(s, index) || continue
180-
# We use iterate here instead of indexing, because indexing wastefully
181-
# checks for valid index. It would be better if there was something like
182-
# try_getindex(::String, ::Int) we could use.
183-
char = first(something(iterate(s, index)))
184-
pred(char) && push!(result, index)
257+
iter = FwCharPosIter(s, pred.x)
258+
return if is_standalone_byte(iter.last_char_byte)
259+
findall(==(iter.last_char_byte), codeunits(s))
260+
else
261+
# It is slightly wasteful that every iteration will check is_standalone_byte
262+
# again, but this should only be minor overhead in the non-fast path.
263+
collect(iter)
185264
end
186265
end
187266

@@ -255,7 +334,6 @@ function findnext(testf::Function, s::AbstractString, i::Integer)
255334
return nothing
256335
end
257336

258-
259337
in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing)
260338

261339
function _searchindex(s::Union{AbstractString,DenseUInt8OrInt8},

base/util.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,7 @@ unsafe_crc32c(a, n, crc) = ccall(:jl_crc32c, UInt32, (UInt32, Ptr{UInt8}, Csize_
508508
_crc32c(a::NTuple{<:Any, UInt8}, crc::UInt32=0x00000000) =
509509
unsafe_crc32c(Ref(a), length(a) % Csize_t, crc)
510510

511-
function _crc32c(a::DenseBytes, crc::UInt32=0x00000000)
511+
function _crc32c(a::DenseUInt8OrInt8, crc::UInt32=0x00000000)
512512
unsafe_crc32c(a, length(a) % Csize_t, crc)
513513
end
514514

stdlib/CRC32c/src/CRC32c.jl

+3-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ See [`CRC32c.crc32c`](@ref) for more information.
77
"""
88
module CRC32c
99

10-
import Base: DenseBytes
10+
import Base.FastContiguousSubArray
11+
import Base: DenseUInt8OrInt8
1112

1213
export crc32c
1314

@@ -50,7 +51,7 @@ function crc32c(a::AbstractVector{UInt8}, crc::UInt32=0x00000000)
5051
return crc
5152
end
5253

53-
function crc32c(a::DenseBytes, crc::UInt32=0x00000000)
54+
function crc32c(a::DenseUInt8OrInt8, crc::UInt32=0x00000000)
5455
Base._crc32c(a, crc)
5556
end
5657

0 commit comments

Comments
 (0)