@@ -31,9 +31,6 @@ const DenseUInt8 = Union{
31
31
32
32
const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8}
33
33
34
- last_byteindex (x:: Union{String, SubString{String}} ) = ncodeunits (x)
35
- last_byteindex (x:: DenseUInt8OrInt8 ) = lastindex (x)
36
-
37
34
function last_utf8_byte (c:: Char )
38
35
u = reinterpret (UInt32, c)
39
36
shift = ((4 - ncodeunits (c)) * 8 ) & 31
@@ -44,144 +41,226 @@ end
44
41
# This holds even in the presence of invalid UTF8
45
42
is_standalone_byte (x:: UInt8 ) = (x < 0x80 ) | (x > 0xf7 )
46
43
47
- function findnext (pred:: Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar} ,
48
- s:: Union{String, SubString{String}} , i:: Integer )
49
- if i < 1 || i > sizeof (s)
50
- i == sizeof (s) + 1 && return nothing
51
- throw (BoundsError (s, i))
44
+ last_byteindex (x:: Union{String, SubString{String}} ) = ncodeunits (x)
45
+ last_byteindex (x:: DenseUInt8OrInt8 ) = lastindex (x)
46
+
47
+ # Internal type - lazy iterator over positions of char in string
48
+ struct FwCharPosIter{S}
49
+ string:: S # S is assumed to be either String or SubString{String}
50
+ char:: Char
51
+ # Char searchers search for the last UTF8 byte, because this byte tends to
52
+ # have the most variety in real texts, so any individual value is rarer.
53
+ # This allows more work to be done in the fast path using memchr.
54
+ last_char_byte:: UInt8
55
+ end
56
+
57
+ function FwCharPosIter (s:: Union{String, SubString{String}} , c:: AbstractChar )
58
+ char = Char (c):: Char
59
+ byte = last_utf8_byte (char)
60
+ FwCharPosIter {typeof(s)} (s, char, byte)
61
+ end
62
+
63
+ # i is the index in the string to search from.
64
+ # We assume it's never < firstindex(s.string)
65
+ function Base. iterate (s:: FwCharPosIter , i:: Int = 1 )
66
+ scu = ncodeunits (s. string)
67
+
68
+ # By definition, if the last byte is a standalone byte, then the char
69
+ # is a single-byte char where the byte can never be a subset of another char.
70
+ # Hence, we can simply search for the occurrence of the byte itself.
71
+ if is_standalone_byte (s. last_char_byte)
72
+ i > scu && return nothing
73
+ i = _search (s. string, s. last_char_byte, i)
74
+ i === nothing ? nothing : (i, i + 1 )
75
+ else
76
+ ncu = ncodeunits (s. char)
77
+ while true
78
+ i > scu && return nothing
79
+ i = _search (s. string, s. last_char_byte, i)
80
+ i === nothing && return nothing
81
+ # Increment i before the continue to avoid infinite loop.
82
+ # Since we search for the last byte in the char, the index has an offset.
83
+ i += 1
84
+ index = i - ncu
85
+ # The byte may be part of a different char, in which case index
86
+ # may be invalid.
87
+ isvalid (s. string, index) || continue
88
+ # Here, we use iterate instead of indexing, because indexing needlessly
89
+ # re-validates the index which we have already done here.
90
+ # This relies on the implementation detail that the iterator state for
91
+ # iterating strings is the same as the byte index.
92
+ char = first (something (iterate (s. string, index)))
93
+ char == s. char && return (index, i)
94
+ end
52
95
end
53
- @inbounds isvalid (s, i) || string_index_err (s, i)
54
- c = pred. x
55
- c ≤ ' \x 7f' && return _search (s, first_utf8_byte (c), i)
56
- while true
57
- i = _search (s, first_utf8_byte (c), i)
58
- i === nothing && return nothing
59
- isvalid (s, i) && pred (s[i]) && return i
60
- i = nextind (s, i)
96
+ end
97
+
98
+ # Internal type - lazy iterator over positions of char in string, in reverse order
99
+ struct RvCharPosIter{S}
100
+ string:: S # S is assumed to be either String or SubString{String}
101
+ char:: Char
102
+ last_char_byte:: UInt8
103
+ end
104
+
105
+ IteratorSize (s:: Type{<:Union{FwCharPosIter, RvCharPosIter}} ) = SizeUnknown ()
106
+ eltype (:: Type{<:Union{FwCharPosIter, RvCharPosIter}} ) = Int
107
+
108
+ function RvCharPosIter (s:: Union{String, SubString{String}} , c:: AbstractChar )
109
+ char = Char (c):: Char
110
+ byte = last_utf8_byte (char)
111
+ RvCharPosIter {typeof(s)} (s, char, byte)
112
+ end
113
+
114
+ # i is the index in the string to search from
115
+ # We assume it's never > ncodeunits(s.string)
116
+ # This is the same implementation as FwCharPosIter, except for two differences:
117
+ # 1. i must be decremented, not incremented because we are searching backwards
118
+ # 2. Because we search for the last byte, the starting value of i need to be
119
+ # incremented in the beginning, as that byte may be found at i + ncodeunits(char) - 1.
120
+ function Base. iterate (s:: RvCharPosIter , i:: Int = ncodeunits (s. string))
121
+ ncu = ncodeunits (s. char)
122
+ if is_standalone_byte (s. last_char_byte)
123
+ i < ncu && return nothing
124
+ i = _rsearch (s. string, s. last_char_byte, i)
125
+ i === nothing ? nothing : (i, i - 1 )
126
+ else
127
+ i = min (ncodeunits (s. string), i + ncu - 1 )
128
+ while true
129
+ i < ncu && return nothing
130
+ i = _rsearch (s. string, s. last_char_byte, i)
131
+ i === nothing && return nothing
132
+ index = i - ncu + 1
133
+ i -= 1
134
+ isvalid (s. string, index) || continue
135
+ char = first (something (iterate (s. string, index)))
136
+ char == s. char && return (index, i)
137
+ end
61
138
end
62
139
end
63
140
64
- function findfirst (pred:: Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8, Int8}} , a:: Union{DenseInt8, DenseUInt8} )
65
- findnext (pred, a, firstindex (a))
141
+ function try_next (x, state)
142
+ y = iterate (x, state)
143
+ y === nothing ? nothing : first (y)
144
+ end
145
+
146
+ function findnext (
147
+ pred:: Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar} ,
148
+ s:: Union{String, SubString{String}} ,
149
+ i:: Integer ,
150
+ )
151
+ # TODO : Redesign these strange rules for errors, see #54584
152
+ scu = ncodeunits (s)
153
+ i == scu + 1 && return nothing
154
+ @boundscheck if i < 1 || i > scu + 1
155
+ throw (BoundsError (s, i))
156
+ end
157
+ # The most common case is probably searching for an ASCII char.
158
+ # We inline this critical path here to avoid instantiating a
159
+ # FwCharPosIter in the common case.
160
+ c = Char (pred. x):: Char
161
+ u = (reinterpret (UInt32, c) >> 24 ) % UInt8
162
+ i = Int (i):: Int
163
+ isvalid (s, i) || string_index_err (s, i)
164
+ return if is_standalone_byte (u)
165
+ _search (s, u, i)
166
+ else
167
+ try_next (FwCharPosIter (s, c, last_utf8_byte (c)), i)
168
+ end
66
169
end
67
170
68
171
function findnext (pred:: Fix2{<:Union{typeof(isequal),typeof(==)},UInt8} , a:: DenseUInt8 , i:: Integer )
172
+ @boundscheck i < firstindex (a) && throw (BoundsError (a, i))
173
+ i > lastindex (a) && return nothing
69
174
_search (a, pred. x, i)
70
175
end
71
176
72
177
function findnext (pred:: Fix2{<:Union{typeof(isequal),typeof(==)},Int8} , a:: DenseInt8 , i:: Integer )
178
+ @boundscheck i < firstindex (a) && throw (BoundsError (a, i))
179
+ i > lastindex (a) && return nothing
73
180
_search (a, pred. x, i)
74
181
end
75
182
76
183
# iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same,
77
184
# so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa
78
- findfirst (:: typeof (iszero), a:: DenseUInt8OrInt8 ) = _search (a, zero (UInt8))
79
- findnext (:: typeof (iszero), a:: DenseUInt8OrInt8 , i:: Integer ) = _search (a, zero (UInt8), i)
185
+ function findnext (:: typeof (iszero), a:: DenseUInt8OrInt8 , i:: Integer )
186
+ @boundscheck i < firstindex (a) && throw (BoundsError (a, i))
187
+ i > lastindex (a) && return nothing
188
+ _search (a, zero (UInt8), i)
189
+ end
80
190
191
+ # This is essentially just a wrapper around memchr. i must be inbounds.
81
192
function _search (a:: Union{String,SubString{String},DenseUInt8OrInt8} , b:: Union{Int8,UInt8} , i:: Integer = firstindex (a))
82
193
fst = firstindex (a)
83
- lst = last_byteindex (a)
84
- if i < fst
85
- throw (BoundsError (a, i))
86
- end
87
- n_bytes = lst - i + 1
88
- if i > lst
89
- return i == lst+ 1 ? nothing : throw (BoundsError (a, i))
90
- end
91
194
GC. @preserve a begin
92
195
p = pointer (a)
93
- q = ccall (:memchr , Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+ i- fst, b, n_bytes )
196
+ q = ccall (:memchr , Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+ i- fst, b, last_byteindex (a) - i + 1 )
94
197
end
95
198
return q == C_NULL ? nothing : (q- p+ fst) % Int
96
199
end
97
200
98
- function _search (a:: DenseUInt8 , b:: AbstractChar , i:: Integer = firstindex (a))
99
- if isascii (b)
100
- _search (a,UInt8 (b),i)
101
- else
102
- _search (a,codeunits (string (b)),i). start
201
+ function findprev (
202
+ pred:: Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar} ,
203
+ s:: Union{String, SubString{String}} ,
204
+ i:: Integer ,
205
+ )
206
+ # TODO : Redesign these strange rules for errors, see #54584
207
+ if i == ncodeunits (s) + 1 || i == 0
208
+ return nothing
103
209
end
104
- end
105
-
106
- function findprev (pred:: Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar} ,
107
- s:: Union{String, SubString{String}} , i:: Integer )
108
- c = pred. x
109
- c ≤ ' \x 7f' && return _rsearch (s, first_utf8_byte (c), i)
110
- b = first_utf8_byte (c)
111
- while true
112
- i = _rsearch (s, b, i)
113
- i == nothing && return nothing
114
- isvalid (s, i) && pred (s[i]) && return i
115
- i = prevind (s, i)
210
+ @boundscheck if i < 1 || i > ncodeunits (s) + 1
211
+ throw (BoundsError (s, i))
212
+ end
213
+ # Manually inline the fast path if c is ASCII, as we expect it to often be
214
+ c = Char (pred. x):: Char
215
+ u = (reinterpret (UInt32, c) >> 24 ) % UInt8
216
+ i = Int (i):: Int
217
+ return if is_standalone_byte (u)
218
+ _rsearch (s, u, i)
219
+ else
220
+ try_next (RvCharPosIter (s, c, last_utf8_byte (c)), i)
116
221
end
117
- end
118
-
119
- function findlast (pred:: Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}} , a:: DenseUInt8OrInt8 )
120
- findprev (pred, a, lastindex (a))
121
222
end
122
223
123
224
function findprev (pred:: Fix2{<:Union{typeof(isequal),typeof(==)},Int8} , a:: DenseInt8 , i:: Integer )
225
+ @boundscheck i > lastindex (a) && throw (BoundsError (a, i))
226
+ i < firstindex (a) && return nothing
124
227
_rsearch (a, pred. x, i)
125
228
end
126
229
127
230
function findprev (pred:: Fix2{<:Union{typeof(isequal),typeof(==)},UInt8} , a:: DenseUInt8 , i:: Integer )
231
+ @boundscheck i > lastindex (a) && throw (BoundsError (a, i))
232
+ i < firstindex (a) && return nothing
128
233
_rsearch (a, pred. x, i)
129
234
end
130
235
131
236
# See comments above for findfirst(::typeof(iszero)) methods
132
- findlast (:: typeof (iszero), a:: DenseUInt8OrInt8 ) = _rsearch (a, zero (UInt8))
133
- findprev (:: typeof (iszero), a:: DenseUInt8OrInt8 , i:: Integer ) = _rsearch (a, zero (UInt8), i)
237
+ function findprev (:: typeof (iszero), a:: DenseUInt8OrInt8 , i:: Integer )
238
+ @boundscheck i > lastindex (a) && throw (BoundsError (a, i))
239
+ i < firstindex (a) && return nothing
240
+ _rsearch (a, zero (UInt8), i)
241
+ end
134
242
243
+ # This is essentially just a wrapper around memrchr. i must be inbounds.
135
244
function _rsearch (a:: Union{String,SubString{String},DenseUInt8OrInt8} , b:: Union{Int8,UInt8} , i:: Integer = last_byteindex (a))
136
245
fst = firstindex (a)
137
- lst = last_byteindex (a)
138
- if i < fst
139
- return i == fst - 1 ? nothing : throw (BoundsError (a, i))
140
- end
141
- if i > lst
142
- return i == lst+ 1 ? nothing : throw (BoundsError (a, i))
143
- end
144
246
GC. @preserve a begin
145
247
p = pointer (a)
146
248
q = ccall (:memrchr , Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i- fst+ 1 )
147
249
end
148
250
return q == C_NULL ? nothing : (q- p+ fst) % Int
149
251
end
150
252
151
- function _rsearch (a:: DenseUInt8 , b:: AbstractChar , i:: Integer = length (a))
152
- if isascii (b)
153
- _rsearch (a,UInt8 (b),i)
154
- else
155
- _rsearch (a,codeunits (string (b)),i). start
156
- end
157
- end
158
-
159
253
function findall (
160
254
pred:: Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar} ,
161
- s:: Union{String, SubString{String}}
255
+ s:: Union{String, SubString{String}} ,
162
256
)
163
- c = Char (pred. x):: Char
164
- byte = last_utf8_byte (c)
165
- ncu = ncodeunits (c)
166
-
167
- # If only one byte, and can't be part of another Char: Forward to memchr.
168
- is_standalone_byte (byte) && return findall (== (byte), codeunits (s))
169
- result = Int[]
170
- i = firstindex (s)
171
- while true
172
- i = _search (s, byte, i)
173
- isnothing (i) && return result
174
- i += 1
175
- index = i - ncu
176
- # If the char is invalid, it's possible that its first byte is
177
- # inside another char. If so, indexing into the string will throw an
178
- # error, so we need to check for valid indices.
179
- isvalid (s, index) || continue
180
- # We use iterate here instead of indexing, because indexing wastefully
181
- # checks for valid index. It would be better if there was something like
182
- # try_getindex(::String, ::Int) we could use.
183
- char = first (something (iterate (s, index)))
184
- pred (char) && push! (result, index)
257
+ iter = FwCharPosIter (s, pred. x)
258
+ return if is_standalone_byte (iter. last_char_byte)
259
+ findall (== (iter. last_char_byte), codeunits (s))
260
+ else
261
+ # It is slightly wasteful that every iteration will check is_standalone_byte
262
+ # again, but this should only be minor overhead in the non-fast path.
263
+ collect (iter)
185
264
end
186
265
end
187
266
@@ -255,7 +334,6 @@ function findnext(testf::Function, s::AbstractString, i::Integer)
255
334
return nothing
256
335
end
257
336
258
-
259
337
in (c:: AbstractChar , s:: AbstractString ) = (findfirst (isequal (c),s)!= = nothing )
260
338
261
339
function _searchindex (s:: Union{AbstractString,DenseUInt8OrInt8} ,
0 commit comments