Skip to content

Commit 34340d4

Browse files
committed
WIP: Comment IOBuffer code
1 parent f5ce249 commit 34340d4

File tree

1 file changed

+118
-10
lines changed

1 file changed

+118
-10
lines changed

base/iobuffer.jl

+118-10
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,75 @@
22

33
## work with AbstractVector{UInt8} via I/O primitives ##
44

5-
# Stateful string
5+
# Here, u represents used bytes (already read), X represents bytes still to read,
6+
# - represents bytes uninitialized data but which can be written to later,
7+
# and . represents content of `data` which the IOBuffer will not interact with.
8+
9+
# ...uuuuuuuuuuuuuXXXXXXXXXXXXX------.......
10+
# | | | | | | |
11+
# | offset | ptr size | |
12+
# 1 mark | lastindex(data)
13+
# maxsize
14+
15+
# AFTER COMPACTION
16+
# Mark, ptr and size decreases by (mark - offset - 1)
17+
18+
# ...uuuuuXXXXXXXXXXXXX--------------.......
19+
# | || | | | |
20+
# | offset| size | lastindex(data)
21+
# 1 | ptr maxsize
22+
# mark ( == offset + 1)
23+
24+
# * The underlying array is always 1-indexed
25+
# * Data in 1:offset is never touched. This allows an IOBuffer to use a subset of a larger array.
26+
# * Data in maxsize+1:lastindex(data) is also never touched.
27+
# * Data in offset+1:mark-1, if mark > 0 can be deleted, shifting the whole thing to the left
28+
# to make room for more data, without replacing or resizing data
29+
630
mutable struct GenericIOBuffer{T<:AbstractVector{UInt8}} <: IO
7-
data::T # T should support: getindex, setindex!, length, copyto!, similar, and (optionally) resize!
8-
reinit::Bool # if true, data needs to be re-allocated (after take!)
31+
# T should support: getindex, setindex!, length, copyto!, similar, and (optionally) resize!
32+
data::T
33+
34+
# The user can take control of `data` out of this struct. When that happens, instead of eagerly allocating
35+
# a new array, we set `.reinit` to true, and then allocate a new one when needed.
36+
reinit::Bool
937
readable::Bool
1038
writable::Bool
11-
seekable::Bool # if not seekable, implementation is free to destroy (compact) past read data
12-
append::Bool # add data at end instead of at pointer
13-
size::Int # end pointer (and write pointer if append == true) + offset
14-
maxsize::Int # fixed array size (typically pre-allocated)
15-
ptr::Int # read (and maybe write) pointer + offset
16-
offset::Int # offset of ptr and size from actual start of data and actual size
17-
mark::Int # reset mark location for ptr (or <0 for no mark)
39+
40+
# If not seekable, implementation is free to destroy (compact) data in offset+1:mark-1
41+
seekable::Bool
42+
43+
# If true, write new data to the index size+1 instead of the index ptr.
44+
append::Bool
45+
46+
# Last index of `data` that has been written to. Data in size+1:end has not yet been used,
47+
# and may contain arbitrary values.
48+
# This value is always in offset : min(maxsize, lastindex(data))
49+
size::Int
50+
51+
# Size can never be larger than this value. This is useful for two use cases:
52+
# 1. To prevent `data` from becoming too large, using too much memory
53+
# 2. To guarantee that data in maxsize+1:lastindex(data) is never touched
54+
# Note that this can be larger than lastindex(data), since data may be replaced.
55+
# This value is always in 0:typemax(Int)
56+
maxsize::Int
57+
58+
# Data is read/written from/to ptr, except in situations where append is true, in which case
59+
# data is still read from ptr, but written to size+1.
60+
# This value is alwaus in offset+1 : size+1
61+
ptr::Int
62+
63+
# When constructed from a Vector `v`, `data` is its underlying memory, but the memory may contain
64+
# data before v[1]. In that case, the offset is the number of leading bytes in the memory that
65+
# the IO may never touch.
66+
# This value is always in 0:lastindex(data)
67+
offset::Int
68+
69+
# Data before the marked location for non-seekable buffers can be compacted.
70+
# If this is -1, the mark is not set.
71+
# The purpose of the mark is to reset the stream to a given position using reset.
72+
# This value is always == -1, or in offset+1:size
73+
mark::Int
1874

1975
function GenericIOBuffer{T}(data::T, readable::Bool, writable::Bool, seekable::Bool, append::Bool,
2076
maxsize::Integer) where T<:AbstractVector{UInt8}
@@ -29,6 +85,9 @@ function GenericIOBuffer(data::T, readable::Bool, writable::Bool, seekable::Bool
2985
maxsize::Integer) where T<:AbstractVector{UInt8}
3086
GenericIOBuffer{T}(data, readable, writable, seekable, append, maxsize)
3187
end
88+
89+
# For this method, we use the underlying Memory of the vector. Therefore, we need to set the offset,
90+
# ptr and size accordingly, so the buffer only uses the part of the memory that the vector does.
3291
function GenericIOBuffer(data::Vector{UInt8}, readable::Bool, writable::Bool, seekable::Bool, append::Bool,
3392
maxsize::Integer)
3493
ref = data.ref
@@ -135,6 +194,8 @@ function IOBuffer(;
135194
size = sizehint !== nothing ? Int(sizehint) : maxsize != typemax(Int) ? Int(maxsize) : 32
136195
flags = open_flags(read=read, write=write, append=append, truncate=truncate)
137196
buf = IOBuffer(
197+
# A common usecase of IOBuffer is to incrementally construct strings. By using StringMemory
198+
# as the default storage, we can turn the result into a string without copying.
138199
StringMemory(size),
139200
read=flags.read,
140201
write=flags.write,
@@ -247,6 +308,7 @@ function read_sub(from::GenericIOBuffer, a::AbstractArray{T}, offs, nel) where T
247308
if offs+nel-1 > length(a) || offs < 1 || nel < 0
248309
throw(BoundsError())
249310
end
311+
# Use memcpy where applicable for performance
250312
if isa(a, MutableDenseArrayType{UInt8})
251313
nb = UInt(nel * sizeof(T))
252314
GC.@preserve a unsafe_read(from, pointer(a, offs), nb)
@@ -283,18 +345,29 @@ read(from::GenericIOBuffer, ::Type{Ptr{T}}) where {T} = convert(Ptr{T}, read(fro
283345
isreadable(io::GenericIOBuffer) = io.readable
284346
iswritable(io::GenericIOBuffer) = io.writable
285347

348+
# Number of bytes that can be read from the buffer, if you seek to the start first.
286349
filesize(io::GenericIOBuffer) = (io.seekable ? io.size - io.offset : bytesavailable(io))
350+
351+
# Number of bytes that can be read from the buffer.
287352
bytesavailable(io::GenericIOBuffer) = io.size - io.ptr + 1
353+
354+
# Position is zero-indexed, but ptr is one-indexed, hence the -1
288355
position(io::GenericIOBuffer) = io.ptr - io.offset - 1
289356

290357
function skip(io::GenericIOBuffer, n::Integer)
291358
skip(io, clamp(n, Int))
292359
end
360+
293361
function skip(io::GenericIOBuffer, n::Int)
362+
# In both cases, the result will never go to before the first position,
363+
# nor beyond the last position, and will not throw an error unless the stream
364+
# is not seekable and try to skip a negative number of bytes.
294365
if signbit(n)
366+
# Skipping a negative number of bytes is equivalent to seeking backwards.
295367
seekto = clamp(widen(position(io)) + widen(n), Int)
296368
seek(io, seekto) # Does error checking
297369
else
370+
# Don't use seek in order to allow a non-seekable IO to still skip bytes.
298371
n_max = io.size + 1 - io.ptr
299372
io.ptr += min(n, n_max)
300373
io
@@ -304,6 +377,7 @@ end
304377
function seek(io::GenericIOBuffer, n::Integer)
305378
seek(io, clamp(n, Int))
306379
end
380+
307381
function seek(io::GenericIOBuffer, n::Int)
308382
if !io.seekable
309383
ismarked(io) || throw(ArgumentError("seek failed, IOBuffer is not seekable and is not marked"))
@@ -329,6 +403,7 @@ function _resize!(io::GenericIOBuffer, sz::Int)
329403
a = io.data
330404
offset = io.offset
331405
if applicable(resize!, a, sz)
406+
# TODO: This is buggy: The buffer should never touch data before the offset.
332407
if offset != 0
333408
size = io.size
334409
size > offset && copyto!(a, 1, a, offset + 1, min(sz, size - offset))
@@ -339,6 +414,7 @@ function _resize!(io::GenericIOBuffer, sz::Int)
339414
resize!(a, sz)
340415
else
341416
size = io.size
417+
# Make a new data buffer, only if there is not room in existing buffer
342418
if size >= sz && sz != 0
343419
b = a
344420
else
@@ -373,9 +449,15 @@ function truncate(io::GenericIOBuffer, n::Integer)
373449
return io
374450
end
375451

452+
# Internal method. Delete used data in the buffer, shifting the rest to the left.
453+
# Does not delete data at or after the mark, nor data before the offset, nor data
454+
# after ptr.
376455
function compact(io::GenericIOBuffer)
456+
# For a seekable buffer, the user could always seek back to the used data.
457+
# Therefore, it is invalid to compact a seekable buffer
377458
io.writable || throw(ArgumentError("compact failed, IOBuffer is not writeable"))
378459
io.seekable && throw(ArgumentError("compact failed, IOBuffer is seekable"))
460+
# If the data is invalid and needs to be replaced, no point in compacting
379461
io.reinit && return
380462
local ptr::Int, bytes_to_move::Int
381463
if ismarked(io) && io.mark < position(io)
@@ -386,6 +468,7 @@ function compact(io::GenericIOBuffer)
386468
ptr = io.ptr
387469
bytes_to_move = bytesavailable(io)
388470
end
471+
# TODO: Invalid: Buffer must not touch data before offset
389472
copyto!(io.data, 1, io.data, ptr, bytes_to_move)
390473
io.size -= ptr - 1
391474
io.ptr -= ptr - 1
@@ -474,19 +557,30 @@ julia> String(take!(io))
474557
function take!(io::GenericIOBuffer)
475558
ismarked(io) && unmark(io)
476559
if io.seekable
560+
# If the buffer is seekable, then the previously consumed bytes from ptr+1:size
561+
# must still be output, as they are not truly gone.
562+
# Hence, we output all bytes from offset+1:io.size
477563
nbytes = io.size - io.offset
478564
data = copyto!(StringVector(nbytes), 1, io.data, io.offset + 1, nbytes)
479565
else
566+
# Else, if not seekable, bytes from offset+1:ptr-1 are truly gone and should not
567+
# be output. Hence, we output `bytesavailable`, which is ptr:size
480568
nbytes = bytesavailable(io)
481569
data = read!(io, StringVector(nbytes))
482570
end
571+
# TODO: Why do we not reinit here? The user has taken control of the buffer
572+
# so it's not safe to hold onto it.
483573
if io.writable
484574
io.ptr = 1
485575
io.size = 0
486576
io.offset = 0
487577
end
488578
return data
489579
end
580+
581+
# This method is specialized because we know the underlying data is a Memory, so we can
582+
# e.g. wrap directly in an array without copying. Otherwise the logic is the same as
583+
# the generic method
490584
function take!(io::IOBuffer)
491585
ismarked(io) && unmark(io)
492586
if io.seekable
@@ -552,6 +646,7 @@ function unsafe_write(to::GenericIOBuffer, p::Ptr{UInt8}, nb::UInt)
552646
written = Int(min(nb, Int(length(to.data))::Int - ptr + 1))
553647
towrite = written
554648
d = to.data
649+
# TODO: This inbounds is unsafe, since the underlying data may be of any type
555650
while towrite > 0
556651
@inbounds d[ptr] = unsafe_load(p)
557652
ptr += 1
@@ -565,6 +660,9 @@ function unsafe_write(to::GenericIOBuffer, p::Ptr{UInt8}, nb::UInt)
565660
return written
566661
end
567662

663+
# TODO: We should have a method that uses memcpy (`copyto!`) for the IOBuffer case.
664+
# Preliminary testing suggests this would be ~10x faster than the current implementation
665+
568666
@inline function write(to::GenericIOBuffer, a::UInt8)
569667
ensureroom(to, UInt(1))
570668
ptr = (to.append ? to.size+1 : to.ptr)
@@ -590,15 +688,21 @@ function readbytes!(io::GenericIOBuffer, b::MutableDenseArrayType{UInt8}, nb::In
590688
return nr
591689
end
592690
read(io::GenericIOBuffer) = read!(io, StringVector(bytesavailable(io)))
691+
692+
# For IO buffers, all the data is immediately available.
593693
readavailable(io::GenericIOBuffer) = read(io)
694+
594695
read(io::GenericIOBuffer, nb::Integer) = read!(io, StringVector(min(nb, bytesavailable(io))))
595696

596697
function occursin(delim::UInt8, buf::IOBuffer)
698+
# TODO: This pointer call is invalid. Also, perhaps this should use the default `in` method,
699+
# which already should be implemented using memchr.
597700
p = pointer(buf.data, buf.ptr)
598701
q = GC.@preserve buf ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, delim, bytesavailable(buf))
599702
return q != C_NULL
600703
end
601704

705+
# TODO: Invalid use of @inbounds, and also we can use the generic `in` method.
602706
function occursin(delim::UInt8, buf::GenericIOBuffer)
603707
data = buf.data
604708
for i = buf.ptr:buf.size
@@ -625,6 +729,8 @@ function copyline(out::GenericIOBuffer, s::IO; keep::Bool=false)
625729
copyuntil(out, s, 0x0a, keep=true)
626730
line = out.data
627731
i = out.size # XXX: this is only correct for appended data. if the data was inserted, only ptr should change
732+
# TODO: This computation seems to be invalid. The buffer size may be smaller than out.size
733+
# due to the presence of out.offset.
628734
if keep || i == out.offset || line[i] != 0x0a
629735
return out
630736
elseif i < 2 || line[i-1] != 0x0d
@@ -644,6 +750,7 @@ function _copyline(out::IO, io::GenericIOBuffer; keep::Bool=false)
644750
# note: findfirst + copyto! is much faster than a single loop
645751
# except for nout ≲ 20. A single loop is 2x faster for nout=5.
646752
nout = nread = something(findfirst(==(0x0a), data), length(data))
753+
# Remove the 0x0a (newline) if not keep, and also remove the 0x0d (\r) if it is there
647754
if !keep && nout > 0 && data[nout] == 0x0a
648755
nout -= 1
649756
nout > 0 && data[nout] == 0x0d && (nout -= 1)
@@ -652,6 +759,7 @@ function _copyline(out::IO, io::GenericIOBuffer; keep::Bool=false)
652759
io.ptr += nread
653760
return out
654761
end
762+
655763
copyline(out::IO, io::GenericIOBuffer; keep::Bool=false) = _copyline(out, io; keep)
656764
copyline(out::GenericIOBuffer, io::GenericIOBuffer; keep::Bool=false) = _copyline(out, io; keep)
657765

0 commit comments

Comments
 (0)