Skip to content

Commit 835a76b

Browse files
committed
More efficient copy of IOBuffer
Only copy the used bytes, not unreachable bytes.
1 parent dcb3943 commit 835a76b

File tree

2 files changed

+76
-14
lines changed

2 files changed

+76
-14
lines changed

base/iobuffer.jl

+45-14
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,15 @@
2020
# uuuuuXXXXXXXXXXXXX---------------------
2121
# || | | | |
2222
# |1 ptr size | maxsize (≥ lastindex)
23-
# lastindex(data)
23+
# | lastindex(data)
2424
# offset (set to zero)
2525

2626
# * The underlying array is always 1-indexed
2727
# * The IOBuffer has full control (ownership) of the underlying array, only when
2828
# buffer.write == true.
29-
# * Data in 1:mark can be deleted, shifting the whole thing to the left
30-
# to make room for more data, without replacing or resizing data
29+
# * Data before the mark can be deleted, shifting the whole thing to the left
30+
# to make room for more data, without replacing or resizing data.
31+
# This can be done only if the buffer is not seekable
3132

3233
# Internal trait object used to access unsafe constructors.
3334
struct UnsafeMethod end
@@ -59,20 +60,23 @@ mutable struct GenericIOBuffer{T<:AbstractVector{UInt8}} <: IO
5960
# This value is always in 0 : lastindex(data)
6061
size::Int
6162

63+
# When the buffer is resized, or a new buffer allocated, this is the maximum size of the buffer.
64+
# A new GenericIOBuffer may be constructed with an existing data array larger than `maxsize`.
65+
# When that happens, the buffer will not write to data in maxsize + 1 : lastindex(data).
6266
# This value is always in 0:typemax(Int).
63-
# We always have length(data) <= maxsize
6467
maxsize::Int
6568

6669
# Data is read/written from/to ptr, except in situations where append is true, in which case
6770
# data is still read from ptr, but written to size+1.
68-
# This value is always in 1 : size+1
71+
# This value is always in offset + 1 : size+1
6972
ptr::Int
7073

7174
# This is used when seeking. seek(io, 0) results in ptr == offset + 1.
7275
# The offset is needed because, if a buffer is instantiated from a Vector with a non-zero
7376
# memory offset, the start of the vector, and thus the start of data, does not correspond
7477
# to the start of its underlying memory.
7578
# Once the offset is set to zero, it will never be set to nonzero.
79+
# This is always in 0:lastindex(data)
7680
offset::Int
7781

7882
# mark is the position (as given by `position`, i.e. io.ptr - io.offset - 1)
@@ -269,16 +273,43 @@ PipeBuffer(maxsize::Integer) = (x = PipeBuffer(StringMemory(maxsize), maxsize =
269273
_similar_data(b::GenericIOBuffer, len::Int) = similar(b.data, len)
270274
_similar_data(b::IOBuffer, len::Int) = StringMemory(len)
271275

272-
# TODO: Only copy the used data, not the whole buffer.
276+
# Note: Copying may change the value of the position (and mark) for un-seekable streams.
277+
# However, these values are not stable anyway due to compaction.
278+
273279
function copy(b::GenericIOBuffer)
274-
ret = typeof(b)(b.reinit ? _similar_data(b, 0) : b.writable ?
275-
copyto!(_similar_data(b, length(b.data)), b.data) : b.data,
276-
b.readable, b.writable, b.seekable, b.append, b.maxsize)
277-
ret.size = b.size
278-
ret.ptr = b.ptr
279-
ret.mark = b.mark
280-
ret.offset = b.offset
281-
return ret
280+
if b.reinit
281+
# If buffer is used up, allocate a new size-zero buffer
282+
# Reinit implies writable, and that ptr, size, offset and mark are already the default values
283+
return typeof(b)(_similar_data(b, 0), b.readable, b.writable, b.seekable, b.append, b.maxsize)
284+
elseif b.writable
285+
# Else, we just copy the reachable bytes. If buffer is seekable, all bytes
286+
# after offset are reachable, since they can be seeked to
287+
used_span = if b.seekable
288+
b.offset + 1 : b.size
289+
else
290+
# Even non-seekable streams can be seeked using `reset`. Therefore, we need to
291+
# copy all data from mark if it's set and below ptr.
292+
(b.mark > -1 ? min(b.ptr, b.mark) : b.ptr) : b.size
293+
end
294+
len = length(used_span)
295+
data = copyto!(_similar_data(b, len), view(b.data, used_span))
296+
ret = typeof(b)(data, b.readable, b.writable, b.seekable, b.append, b.maxsize)
297+
ret.size = len
298+
ret.offset = 0
299+
ret.ptr = b.ptr - first(used_span) + 1
300+
ret.mark = b.mark < 0 ? -1 : (b.mark - first(used_span) + 1)
301+
return ret
302+
else
303+
# When the buffer is just readable, the original and the copy can share the same data, so we just make
304+
# a shallow copy of the IOBuffer struct.
305+
# Use the unsafe method because `b.maxsize` may be larger than `b.data`, and the copy
306+
# must be allowed to keep that same `maxsize`.
307+
ret = typeof(b)(unsafe_method, b.data, b.readable, b.writable, b.seekable, b.append, b.maxsize)
308+
ret.offset = b.offset
309+
ret.ptr = b.ptr
310+
ret.mark = b.mark
311+
return ret
312+
end
282313
end
283314

284315
show(io::IO, b::GenericIOBuffer) = print(io, "IOBuffer(data=UInt8[...], ",

test/iobuffer.jl

+31
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,37 @@ end
107107
@test isreadable(buf2)
108108
@test !iswritable(buf2)
109109
@test read(buf2) == 0x04:0x0d
110+
111+
# Test copying a non-seekable stream
112+
buf = new_unseekable_buffer()
113+
write(buf, "abcdef")
114+
read(buf, UInt16)
115+
mark(buf)
116+
read(buf, UInt16)
117+
buf2 = copy(buf)
118+
@test read(buf2) == b"ef"
119+
reset(buf2)
120+
@test read(buf2) == b"cdef"
121+
122+
# Test copying seekable stream
123+
buf = IOBuffer()
124+
write(buf, "abcdef")
125+
seekstart(buf)
126+
read(buf)
127+
mark(buf)
128+
buf2 = copy(buf)
129+
@test reset(buf2) == 6
130+
seekstart(buf2)
131+
@test read(buf2) == b"abcdef"
132+
133+
# Test copying a taken buffer
134+
buf = IOBuffer()
135+
write(buf, "abcdef")
136+
take!(buf)
137+
buf2 = copy(buf)
138+
@test eof(buf2)
139+
seekstart(buf2)
140+
@test eof(buf2)
110141
end
111142

112143
@testset "copyuntil" begin

0 commit comments

Comments
 (0)