Refactor char/string and byte search (#54667)

jakobnissen · web-flow · commit 59320c62993c · 2025-03-06T13:11:20.000+01:00
This is a refactoring of `base/string/search.jl`. It is purely internal, and comes with no changes in behaviour. It's based on #54593 and #54579, so those needs to get merged first, then this PR will be rebased onto master. Included changes are: * The char/string search functions now use the last byte to memchr, not the first byte. Because the last bytes are more varied, this is much faster on small non-ASCII alphabets (like searching Greek or Cyrillic text) and somewhat faster on large non-ASCII ones (like Japanese). Speed on ASCII alphabets (like English) in unchanged. * Several unused or redundant methods have been removed * Moved boundschecks from the inner `_search` and `_rsearch` functions to the outer top-level functions that call them. This is because the former may be called in a loop where repeated boundschecking is needless. This should speed up search a bit. * Char/string search functions are now implemented in terms of an internal lazy iterator. This allows `findall` and `findnext` to share implementation, and will also make it trivially easy to implement a lazy findall in the future (see #43737) IMO there is still more work to be done on this file, but this requires a decision to be made on #43737, #54581 or #54584 ## Benchmarks ```julia using BenchmarkTools using Random rng = Xoshiro(55) greek = join(rand(rng, 'Α':'ψ', 100000)) * 'ω' @Btime findfirst('ω', greek) @Btime findfirst(==('\xce'), greek) english = join(rand(rng, 'A':'y', 100000)) * 'z' @Btime findfirst('z', english) @Btime findall('A', english) @Btime findall('\xff', english) nothing ``` 1.11.0-beta2: ``` 100.049 μs (1 allocation: 16 bytes) 474.084 μs (0 allocations: 0 bytes) 689.110 ns (1 allocation: 16 bytes) 93.536 μs (9 allocations: 21.84 KiB) 72.316 μs (1 allocation: 32 bytes) ``` This PR: ``` 1.319 μs (1 allocation: 16 bytes) 398.011 μs (0 allocations: 0 bytes) 681.550 ns (1 allocation: 16 bytes) 8.867 μs (8 allocations: 21.81 KiB) 683.962 ns (1 allocation: 32 bytes) ```
diff --git a/base/char.jl b/base/char.jl
@@ -224,9 +224,6 @@ isless(x::Char, y::Char) = bitcast(UInt32, x) < bitcast(UInt32, y)
 hash(x::Char, h::UInt) =
     hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
 
-first_utf8_byte(c::Char) = (bitcast(UInt32, c) >> 24) % UInt8
-first_utf8_byte(c::AbstractChar) = first_utf8_byte(Char(c)::Char)
-
 # fallbacks:
 isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
 ==(x::AbstractChar, y::AbstractChar) = Char(x) == Char(y)
diff --git a/base/strings/search.jl b/base/strings/search.jl
@@ -31,9 +31,6 @@ const DenseUInt8 = Union{
 
 const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8}
 
-last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
-last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)
-
 function last_utf8_byte(c::Char)
     u = reinterpret(UInt32, c)
     shift = ((4 - ncodeunits(c)) * 8) & 31
@@ -44,144 +41,226 @@ end
 # This holds even in the presence of invalid UTF8
 is_standalone_byte(x::UInt8) = (x < 0x80) | (x > 0xf7)
 
-function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
-                  s::Union{String, SubString{String}}, i::Integer)
-    if i < 1 || i > sizeof(s)
-        i == sizeof(s) + 1 && return nothing
-        throw(BoundsError(s, i))
+last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
+last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)
+
+# Internal type - lazy iterator over positions of char in string
+struct FwCharPosIter{S}
+    string::S # S is assumed to be either String or SubString{String}
+    char::Char
+    # Char searchers search for the last UTF8 byte, because this byte tends to
+    # have the most variety in real texts, so any individual value is rarer.
+    # This allows more work to be done in the fast path using memchr.
+    last_char_byte::UInt8
+end
+
+function FwCharPosIter(s::Union{String, SubString{String}}, c::AbstractChar)
+    char = Char(c)::Char
+    byte = last_utf8_byte(char)
+    FwCharPosIter{typeof(s)}(s, char, byte)
+end
+
+# i is the index in the string to search from.
+# We assume it's never < firstindex(s.string)
+function Base.iterate(s::FwCharPosIter, i::Int=1)
+    scu = ncodeunits(s.string)
+
+    # By definition, if the last byte is a standalone byte, then the char
+    # is a single-byte char where the byte can never be a subset of another char.
+    # Hence, we can simply search for the occurrence of the byte itself.
+    if is_standalone_byte(s.last_char_byte)
+        i > scu && return nothing
+        i = _search(s.string, s.last_char_byte, i)
+        i === nothing ? nothing : (i, i + 1)
+    else
+        ncu = ncodeunits(s.char)
+        while true
+            i > scu && return nothing
+            i = _search(s.string, s.last_char_byte, i)
+            i === nothing && return nothing
+            # Increment i before the continue to avoid infinite loop.
+            # Since we search for the last byte in the char, the index has an offset.
+            i += 1
+            index = i - ncu
+            # The byte may be part of a different char, in which case index
+            # may be invalid.
+            isvalid(s.string, index) || continue
+            # Here, we use iterate instead of indexing, because indexing needlessly
+            # re-validates the index which we have already done here.
+            # This relies on the implementation detail that the iterator state for
+            # iterating strings is the same as the byte index.
+            char = first(something(iterate(s.string, index)))
+            char == s.char && return (index, i)
+        end
     end
-    @inbounds isvalid(s, i) || string_index_err(s, i)
-    c = pred.x
-    c ≤ '\x7f' && return _search(s, first_utf8_byte(c), i)
-    while true
-        i = _search(s, first_utf8_byte(c), i)
-        i === nothing && return nothing
-        isvalid(s, i) && pred(s[i]) && return i
-        i = nextind(s, i)
+end
+
+# Internal type - lazy iterator over positions of char in string, in reverse order
+struct RvCharPosIter{S}
+    string::S # S is assumed to be either String or SubString{String}
+    char::Char
+    last_char_byte::UInt8
+end
+
+IteratorSize(s::Type{<:Union{FwCharPosIter, RvCharPosIter}}) = SizeUnknown()
+eltype(::Type{<:Union{FwCharPosIter, RvCharPosIter}}) = Int
+
+function RvCharPosIter(s::Union{String, SubString{String}}, c::AbstractChar)
+    char = Char(c)::Char
+    byte = last_utf8_byte(char)
+    RvCharPosIter{typeof(s)}(s, char, byte)
+end
+
+# i is the index in the string to search from
+# We assume it's never > ncodeunits(s.string)
+# This is the same implementation as FwCharPosIter, except for two differences:
+# 1. i must be decremented, not incremented because we are searching backwards
+# 2. Because we search for the last byte, the starting value of i need to be
+#    incremented in the beginning, as that byte may be found at i + ncodeunits(char) - 1.
+function Base.iterate(s::RvCharPosIter, i::Int=ncodeunits(s.string))
+    ncu = ncodeunits(s.char)
+    if is_standalone_byte(s.last_char_byte)
+        i < ncu && return nothing
+        i = _rsearch(s.string, s.last_char_byte, i)
+        i === nothing ? nothing : (i, i - 1)
+    else
+        i = min(ncodeunits(s.string), i + ncu - 1)
+        while true
+            i < ncu && return nothing
+            i = _rsearch(s.string, s.last_char_byte, i)
+            i === nothing && return nothing
+            index = i - ncu + 1
+            i -= 1
+            isvalid(s.string, index) || continue
+            char = first(something(iterate(s.string, index)))
+            char == s.char && return (index, i)
+        end
     end
 end
 
-function findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8, Int8}}, a::Union{DenseInt8, DenseUInt8})
-    findnext(pred, a, firstindex(a))
+function try_next(x, state)
+    y = iterate(x, state)
+    y === nothing ? nothing : first(y)
+end
+
+function findnext(
+    pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
+    s::Union{String, SubString{String}},
+    i::Integer,
+)
+    # TODO: Redesign these strange rules for errors, see #54584
+    scu = ncodeunits(s)
+    i == scu + 1 && return nothing
+    @boundscheck if i < 1 || i > scu + 1
+        throw(BoundsError(s, i))
+    end
+    # The most common case is probably searching for an ASCII char.
+    # We inline this critical path here to avoid instantiating a
+    # FwCharPosIter in the common case.
+    c = Char(pred.x)::Char
+    u = (reinterpret(UInt32, c) >> 24) % UInt8
+    i = Int(i)::Int
+    isvalid(s, i) || string_index_err(s, i)
+    return if is_standalone_byte(u)
+        _search(s, u, i)
+    else
+        try_next(FwCharPosIter(s, c, last_utf8_byte(c)), i)
+    end
 end
 
 function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
+    @boundscheck i < firstindex(a) && throw(BoundsError(a, i))
+    i > lastindex(a) && return nothing
     _search(a, pred.x, i)
 end
 
 function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
+    @boundscheck i < firstindex(a) && throw(BoundsError(a, i))
+    i > lastindex(a) && return nothing
     _search(a, pred.x, i)
 end
 
 # iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same,
 # so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa
-findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = _search(a, zero(UInt8))
-findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _search(a, zero(UInt8), i)
+function findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer)
+    @boundscheck i < firstindex(a) && throw(BoundsError(a, i))
+    i > lastindex(a) && return nothing
+    _search(a, zero(UInt8), i)
+end
 
+# This is essentially just a wrapper around memchr. i must be inbounds.
 function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = firstindex(a))
     fst = firstindex(a)
-    lst = last_byteindex(a)
-    if i < fst
-        throw(BoundsError(a, i))
-    end
-    n_bytes = lst - i + 1
-    if i > lst
-        return i == lst+1 ? nothing : throw(BoundsError(a, i))
-    end
     GC.@preserve a begin
         p = pointer(a)
-        q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, n_bytes)
+        q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, last_byteindex(a) - i + 1)
     end
     return q == C_NULL ? nothing : (q-p+fst) % Int
 end
 
-function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a))
-    if isascii(b)
-        _search(a,UInt8(b),i)
-    else
-        _search(a,codeunits(string(b)),i).start
+function findprev(
+    pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
+    s::Union{String, SubString{String}},
+    i::Integer,
+)
+    # TODO: Redesign these strange rules for errors, see #54584
+    if i == ncodeunits(s) + 1 || i == 0
+        return nothing
     end
-end
-
-function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
-                  s::Union{String, SubString{String}}, i::Integer)
-    c = pred.x
-    c ≤ '\x7f' && return _rsearch(s, first_utf8_byte(c), i)
-    b = first_utf8_byte(c)
-    while true
-        i = _rsearch(s, b, i)
-        i == nothing && return nothing
-        isvalid(s, i) && pred(s[i]) && return i
-        i = prevind(s, i)
+    @boundscheck if i < 1 || i > ncodeunits(s) + 1
+        throw(BoundsError(s, i))
+    end
+    # Manually inline the fast path if c is ASCII, as we expect it to often be
+    c = Char(pred.x)::Char
+    u = (reinterpret(UInt32, c) >> 24) % UInt8
+    i = Int(i)::Int
+    return if is_standalone_byte(u)
+        _rsearch(s, u, i)
+    else
+        try_next(RvCharPosIter(s, c, last_utf8_byte(c)), i)
     end
-end
-
-function findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::DenseUInt8OrInt8)
-    findprev(pred, a, lastindex(a))
 end
 
 function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
+    @boundscheck i > lastindex(a) && throw(BoundsError(a, i))
+    i < firstindex(a) && return nothing
     _rsearch(a, pred.x, i)
 end
 
 function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
+    @boundscheck i > lastindex(a) && throw(BoundsError(a, i))
+    i < firstindex(a) && return nothing
     _rsearch(a, pred.x, i)
 end
 
 # See comments above for findfirst(::typeof(iszero)) methods
-findlast(::typeof(iszero), a::DenseUInt8OrInt8) = _rsearch(a, zero(UInt8))
-findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _rsearch(a, zero(UInt8), i)
+function findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer)
+    @boundscheck i > lastindex(a) && throw(BoundsError(a, i))
+    i < firstindex(a) && return nothing
+    _rsearch(a, zero(UInt8), i)
+end
 
+# This is essentially just a wrapper around memrchr. i must be inbounds.
 function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = last_byteindex(a))
     fst = firstindex(a)
-    lst = last_byteindex(a)
-    if i < fst
-        return i == fst - 1 ? nothing : throw(BoundsError(a, i))
-    end
-    if i > lst
-        return i == lst+1 ? nothing : throw(BoundsError(a, i))
-    end
     GC.@preserve a begin
         p = pointer(a)
         q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i-fst+1)
     end
     return q == C_NULL ? nothing : (q-p+fst) % Int
 end
 
-function _rsearch(a::DenseUInt8, b::AbstractChar, i::Integer = length(a))
-    if isascii(b)
-        _rsearch(a,UInt8(b),i)
-    else
-        _rsearch(a,codeunits(string(b)),i).start
-    end
-end
-
 function findall(
     pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
-    s::Union{String, SubString{String}}
+    s::Union{String, SubString{String}},
 )
-    c = Char(pred.x)::Char
-    byte = last_utf8_byte(c)
-    ncu = ncodeunits(c)
-
-    # If only one byte, and can't be part of another Char: Forward to memchr.
-    is_standalone_byte(byte) && return findall(==(byte), codeunits(s))
-    result = Int[]
-    i = firstindex(s)
-    while true
-        i = _search(s, byte, i)
-        isnothing(i) && return result
-        i += 1
-        index = i - ncu
-        # If the char is invalid, it's possible that its first byte is
-        # inside another char. If so, indexing into the string will throw an
-        # error, so we need to check for valid indices.
-        isvalid(s, index) || continue
-        # We use iterate here instead of indexing, because indexing wastefully
-        # checks for valid index. It would be better if there was something like
-        # try_getindex(::String, ::Int) we could use.
-        char = first(something(iterate(s, index)))
-        pred(char) && push!(result, index)
+    iter = FwCharPosIter(s, pred.x)
+    return if is_standalone_byte(iter.last_char_byte)
+        findall(==(iter.last_char_byte), codeunits(s))
+    else
+        # It is slightly wasteful that every iteration will check is_standalone_byte
+        # again, but this should only be minor overhead in the non-fast path.
+        collect(iter)
     end
 end
 
@@ -255,7 +334,6 @@ function findnext(testf::Function, s::AbstractString, i::Integer)
     return nothing
 end
 
-
 in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing)
 
 function _searchindex(s::Union{AbstractString,DenseUInt8OrInt8},
diff --git a/base/util.jl b/base/util.jl
@@ -508,7 +508,7 @@ unsafe_crc32c(a, n, crc) = ccall(:jl_crc32c, UInt32, (UInt32, Ptr{UInt8}, Csize_
 _crc32c(a::NTuple{<:Any, UInt8}, crc::UInt32=0x00000000) =
     unsafe_crc32c(Ref(a), length(a) % Csize_t, crc)
 
-function _crc32c(a::DenseBytes, crc::UInt32=0x00000000)
+function _crc32c(a::DenseUInt8OrInt8, crc::UInt32=0x00000000)
     unsafe_crc32c(a, length(a) % Csize_t, crc)
 end
 
diff --git a/stdlib/CRC32c/src/CRC32c.jl b/stdlib/CRC32c/src/CRC32c.jl
@@ -7,7 +7,8 @@ See [`CRC32c.crc32c`](@ref) for more information.
 """
 module CRC32c
 
-import Base: DenseBytes
+import Base.FastContiguousSubArray
+import Base: DenseUInt8OrInt8
 
 export crc32c
 
@@ -50,7 +51,7 @@ function crc32c(a::AbstractVector{UInt8}, crc::UInt32=0x00000000)
     return crc
 end
 
-function crc32c(a::DenseBytes, crc::UInt32=0x00000000)
+function crc32c(a::DenseUInt8OrInt8, crc::UInt32=0x00000000)
     Base._crc32c(a, crc)
 end
 
diff --git a/test/strings/search.jl b/test/strings/search.jl