Skip to content

Commit 00aff58

Browse files
committed
Reinstate offset field
Also test the behaviour that depends on this field
1 parent 17bd8e5 commit 00aff58

File tree

2 files changed

+146
-45
lines changed

2 files changed

+146
-45
lines changed

base/iobuffer.jl

+97-45
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,26 @@
44

55
# Here, u represents used bytes (already read), X represents bytes still to read,
66
# - represents bytes uninitialized data but which can be written to later.
7+
# . represents bytes before offset, which the buffer will not touch, until
8+
# a write operation happens.
79

8-
# uuuuuuuuuuuuuXXXXXXXXXXXXX------------
9-
# | | | | | |
10-
# | | ptr size | maxsize (≥ lastindex)
11-
# 1 mark (zero-indexed) lastindex(data)
10+
# .....uuuuuuuuuuuuuXXXXXXXXXXXXX------------
11+
# | | | | | | |
12+
# | offset | ptr size | maxsize (≥ lastindex)
13+
# 1 mark (zero-indexed) lastindex(data)
1214

1315
# AFTER COMPACTION
14-
# Mark, ptr and size decreases by `mark`
16+
# Mark, ptr and size decreases by `mark`. Offset is zeroed.
1517

1618
# uuuuuXXXXXXXXXXXXX---------------------
1719
# || | | | |
1820
# |1 ptr size | maxsize (≥ lastindex)
1921
# mark (zero-indexed) lastindex(data)
22+
# offset (set to zero)
2023

2124
# * The underlying array is always 1-indexed
22-
# * The IOBuffer has full control (ownership) of the underlying array.
25+
# * The IOBuffer has full control (ownership) of the underlying array, only when
26+
# buffer.write == true.
2327
# * Data in 1:mark can be deleted, shifting the whole thing to the left
2428
# to make room for more data, without replacing or resizing data
2529

@@ -33,13 +37,16 @@ mutable struct GenericIOBuffer{T<:AbstractVector{UInt8}} <: IO
3337

3438
# The user can take control of `data` out of this struct. When that happens, instead of eagerly allocating
3539
# a new array, we set `.reinit` to true, and then allocate a new one when needed.
40+
# If reinit is true, the buffer is writable, and offset and size is zero. See `take!`
3641
reinit::Bool
3742
readable::Bool
3843
writable::Bool
3944

4045
# If not seekable, implementation is free to destroy (compact) data in 1:mark-1.
4146
# If it IS seekable, the user may always recover any data in 1:size by seeking,
42-
# so no data can be destroyed
47+
# so no data can be destroyed.
48+
# Non-seekable IOBuffers can only be constructed with `PipeBuffer`, which are writable,
49+
# readable and append.
4350
seekable::Bool
4451

4552
# If true, write new data to the index size+1 instead of the index ptr.
@@ -60,10 +67,18 @@ mutable struct GenericIOBuffer{T<:AbstractVector{UInt8}} <: IO
6067
# This value is always in 1 : size+1
6168
ptr::Int
6269

63-
# Data at the marked location or before for non-seekable buffers can be deleted.
64-
# The mark is zero-indexed. If it is -1, the mark is not set.
65-
# The purpose of the mark is to reset the stream to a given position using reset.
66-
# This value is always in -1 : size-1
70+
# This is used when seeking. seek(io, 0) results in ptr == offset.
71+
# The offset is needed because, if a buffer is instantiated from a Vector with a non-zero
72+
# memory offset, the start of the vector, and thus the start of data, does not correspond
73+
# to the start of its underlying memory.
74+
# Once the offset is set to zero, it will never be set to nonzero.
75+
offset::Int
76+
77+
# mark is the position (as given by `position`, i.e. io.ptr - io.offset - 1)
78+
# which can be seeked back using `reset`, even for non-seekable buffers.
79+
# For non-seekable buffers that can be compacted, data before the mark can be
80+
# destroyed.
81+
# This value is always in -1 : size-offset-1
6782
mark::Int
6883

6984
# Unsafe constructor which does not do any checking
@@ -77,7 +92,7 @@ mutable struct GenericIOBuffer{T<:AbstractVector{UInt8}} <: IO
7792
maxsize::Int,
7893
) where T<:AbstractVector{UInt8}
7994
len = Int(length(data))::Int
80-
return new(data, false, readable, writable, seekable, append, len, maxsize, 1, -1)
95+
return new(data, false, readable, writable, seekable, append, len, maxsize, 1, 0, -1)
8196
end
8297
end
8398

@@ -113,11 +128,10 @@ function GenericIOBuffer(data::Vector{UInt8}, readable::Bool, writable::Bool, se
113128
mem = ref.mem
114129
len = length(data)
115130
offset = memoryrefoffset(ref) - 1
116-
if !iszero(offset)
117-
unsafe_copyto!(mem, 1, mem, offset+1, len)
118-
end
119131
buf = GenericIOBuffer(mem, readable, writable, seekable, append, maxsize)
120-
buf.size = len
132+
buf.size = len + offset
133+
buf.ptr = offset + 1
134+
buf.offset = offset
121135
return buf
122136
end
123137

@@ -199,7 +213,7 @@ function IOBuffer(
199213
flags = open_flags(read=read, write=write, append=append, truncate=truncate)
200214
buf = GenericIOBuffer(data, flags.read, flags.write, true, flags.append, maxsize)
201215
if flags.truncate
202-
buf.size = 0
216+
buf.size = buf.offset
203217
end
204218
return buf
205219
end
@@ -225,9 +239,11 @@ function IOBuffer(;
225239
flags = open_flags(read=read, write=write, append=append, truncate=truncate)
226240
# A common usecase of IOBuffer is to incrementally construct strings. By using StringMemory
227241
# as the default storage, we can turn the result into a string without copying.
228-
buf = GenericIOBuffer{Memory{UInt8}}(unsafe_method, StringMemory(size), flags.read, flags.write, true, flags.append, mz)
242+
# TODO: Do we need to zero this here?
243+
data = fill!(StringMemory(size), 0)
244+
buf = GenericIOBuffer{Memory{UInt8}}(unsafe_method, data, flags.read, flags.write, true, flags.append, mz)
229245
if flags.truncate
230-
buf.size = 0
246+
buf.size = buf.offset
231247
end
232248
return buf
233249
end
@@ -258,6 +274,7 @@ function copy(b::GenericIOBuffer)
258274
ret.size = b.size
259275
ret.ptr = b.ptr
260276
ret.mark = b.mark
277+
ret.offset = b.offset
261278
return ret
262279
end
263280

@@ -266,9 +283,9 @@ show(io::IO, b::GenericIOBuffer) = print(io, "IOBuffer(data=UInt8[...], ",
266283
"writable=", b.writable, ", ",
267284
"seekable=", b.seekable, ", ",
268285
"append=", b.append, ", ",
269-
"size=", b.size, ", ",
286+
"size=", b.size - b.offset, ", ",
270287
"maxsize=", b.maxsize == typemax(Int) ? "Inf" : b.maxsize, ", ",
271-
"ptr=", b.ptr, ", ",
288+
"ptr=", b.ptr - b.offset, ", ",
272289
"mark=", b.mark, ")")
273290

274291
@noinline function _throw_not_readable()
@@ -402,15 +419,15 @@ isreadable(io::GenericIOBuffer) = io.readable
402419
iswritable(io::GenericIOBuffer) = io.writable
403420

404421
# Number of bytes that can be read from the buffer, if you seek to the start first.
405-
filesize(io::GenericIOBuffer) = (io.seekable ? io.size : bytesavailable(io))
422+
filesize(io::GenericIOBuffer) = (io.seekable ? io.size - io.offset : bytesavailable(io))
406423

407424
# Number of bytes that can be read from the buffer.
408425
bytesavailable(io::GenericIOBuffer) = io.size - io.ptr + 1
409426

410427
# Position is zero-indexed, but ptr is one-indexed, hence the -1
411428
# TODO: Document that position for an unseekable stream is invalid, or
412429
# make it error
413-
position(io::GenericIOBuffer) = io.ptr - 1
430+
position(io::GenericIOBuffer) = io.ptr - io.offset - 1
414431

415432
function skip(io::GenericIOBuffer, n::Integer)
416433
skip(io, clamp(n, Int))
@@ -426,8 +443,8 @@ function skip(io::GenericIOBuffer, n::Int)
426443
seek(io, seekto) # Does error checking
427444
else
428445
# Don't use seek in order to allow a non-seekable IO to still skip bytes.
429-
n_max = io.size + 1 - io.ptr
430-
io.ptr += min(n, n_max)
446+
# Handle overflow
447+
io.ptr = min(io.size + 1, clamp(widen(io.ptr) + widen(n), Int))
431448
io
432449
end
433450
end
@@ -445,7 +462,7 @@ function seek(io::GenericIOBuffer, n::Int)
445462
# of an GenericIOBuffer), so that would need to be fixed in order to throw an error here
446463
#(n < 0 || n > io.size - io.offset) && throw(ArgumentError("Attempted to seek outside IOBuffer boundaries."))
447464
#io.ptr = n + io.offset + 1
448-
io.ptr = clamp(n, 0, io.size) + 1
465+
io.ptr = clamp(n, 0, io.size - io.offset) + io.offset + 1
449466
return io
450467
end
451468

@@ -457,8 +474,11 @@ end
457474

458475
# Resize data to exactly size `sz`. Either resize the underlying data,
459476
# or allocate a new one and copy.
477+
# This should only be called after the offset is zero - any operation which calls
478+
# _resize! should reset offset before so.
460479
function _resize!(io::GenericIOBuffer, new_size::Int)
461480
old_data = io.data
481+
@assert iszero(io.offset)
462482
if applicable(resize!, old_data, new_size)
463483
resize!(old_data, new_size)
464484
else
@@ -475,22 +495,30 @@ function _resize!(io::GenericIOBuffer, new_size::Int)
475495
return io
476496
end
477497

478-
function truncate(io::GenericIOBuffer, n::Integer)
498+
# TODO: These errors cannot be converted to LazyString, but it's wasteful to interpolate them here.
499+
function truncate(io::GenericIOBuffer, n::Integer)
479500
io.writable || throw(ArgumentError("truncate failed, IOBuffer is not writeable"))
501+
# Non-seekable buffers can only be constructed with `PipeBuffer`, which is explicitly
502+
# documented to not be truncatable.
480503
io.seekable || throw(ArgumentError("truncate failed, IOBuffer is not seekable"))
481504
n < 0 && throw(ArgumentError("truncate failed, n bytes must be ≥ 0, got $n"))
482505
n > io.maxsize && throw(ArgumentError("truncate failed, $(n) bytes is exceeds IOBuffer maxsize $(io.maxsize)"))
483-
n = Int(n)
506+
n = Int(n)::Int
484507
if io.reinit
508+
@assert iszero(io.offset)
485509
io.data = _similar_data(io, n)
486510
io.reinit = false
487-
elseif n > length(io.data)
488-
_resize!(io, n)
489-
end
490-
ismarked(io) && io.mark > n && unmark(io)
491-
io.data[io.size+1:n] .= 0
492-
io.size = n
493-
io.ptr = min(io.ptr, n+1)
511+
elseif n > length(io.data) - io.offset
512+
# We zero the offset here because that allows us to minimize the resizing,
513+
# saving memory.
514+
zero_offset!(io)
515+
n > length(io.data) && _resize!(io, n)
516+
end
517+
# Since mark is zero-indexed, we must also clear it if they're equal
518+
ismarked(io) && io.mark >= n && (io.mark = -1)
519+
io.data[io.size+1:n+io.offset] .= 0
520+
io.size = n + io.offset
521+
io.ptr = min(io.ptr, n+io.offset+1)
494522
return io
495523
end
496524

@@ -520,12 +548,18 @@ end
520548
io.writable || throw(ArgumentError("ensureroom failed, IOBuffer is not writeable"))
521549
io.data = _similar_data(io, min(io.maxsize, nshort % Int))
522550
io.reinit = false
551+
io.offset = 0
523552
return io
524553
end
525554

526555
@noinline function ensureroom_slowpath(io::GenericIOBuffer, nshort::UInt)
527-
# If the buffer is seekable, the user can seek to before ptr, and so we
528-
# cannot compact the data.
556+
# Begin by zeroing out offset and check if that gives us room enough
557+
nshort -= zero_offset!(io) % UInt
558+
iszero(nshort) && return io
559+
560+
# Else, try to compact the data. To do this, the buffer must not be seekable.
561+
# If it's seekable, the user can recover used data by seeking before ptr,
562+
# and so we can't delete it.
529563
if (!io.seekable && io.ptr > 1)
530564
ptr = io.ptr
531565
mark = io.mark
@@ -556,6 +590,21 @@ end
556590
return io
557591
end
558592

593+
function zero_offset!(io::GenericIOBuffer)::Int
594+
@assert io.writable
595+
offset = io.offset
596+
iszero(offset) && return 0
597+
size = io.size
598+
if size != offset
599+
data = io.data
600+
unsafe_copyto!(data, 1, data, offset + 1, size - offset)
601+
end
602+
io.offset = 0
603+
io.ptr -= offset
604+
io.size -= offset
605+
return offset
606+
end
607+
559608
eof(io::GenericIOBuffer) = (io.ptr - 1 >= io.size)
560609

561610
function closewrite(io::GenericIOBuffer)
@@ -571,6 +620,7 @@ end
571620
io.maxsize = 0
572621
io.ptr = 1
573622
io.mark = -1
623+
io.offset = 0
574624
if io.writable && !io.reinit
575625
io.data = _resize!(io, 0)
576626
end
@@ -601,8 +651,8 @@ function take!(io::GenericIOBuffer)
601651
# If the buffer is seekable, then the previously consumed bytes from ptr+1:size
602652
# must still be output, as they are not truly gone.
603653
# Hence, we output all bytes from 1:io.size
604-
nbytes = io.size
605-
data = copyto!(StringVector(nbytes), 1, io.data, 1, nbytes)
654+
nbytes = io.size - io.offset
655+
data = copyto!(StringVector(nbytes), 1, io.data, io.offset + 1, nbytes)
606656
else
607657
# Else, if not seekable, bytes from 1:ptr-1 are truly gone and should not
608658
# be output. Hence, we output `bytesavailable`, which is ptr:size
@@ -613,6 +663,7 @@ function take!(io::GenericIOBuffer)
613663
io.reinit = true
614664
io.ptr = 1
615665
io.size = 0
666+
io.offset = 0
616667
end
617668
return data
618669
end
@@ -627,9 +678,9 @@ function take!(io::IOBuffer)
627678
if nbytes == 0 || io.reinit
628679
data = StringVector(0)
629680
elseif io.writable
630-
data = wrap(Array, memoryref(io.data, 1), nbytes)
681+
data = wrap(Array, memoryref(io.data, io.offset + 1), nbytes)
631682
else
632-
data = copyto!(StringVector(nbytes), 1, io.data, 1, nbytes)
683+
data = copyto!(StringVector(nbytes), 1, io.data, io.offset + 1, nbytes)
633684
end
634685
else
635686
nbytes = bytesavailable(io)
@@ -645,6 +696,7 @@ function take!(io::IOBuffer)
645696
io.reinit = true
646697
io.ptr = 1
647698
io.size = 0
699+
io.offset = 0
648700
end
649701
return data
650702
end
@@ -662,10 +714,10 @@ It might save an allocation compared to `take!` (if the compiler elides the
662714
Array allocation), as well as omits some checks.
663715
"""
664716
_unsafe_take!(io::IOBuffer) =
665-
wrap(Array, io.size == 0 ?
717+
wrap(Array, io.size == io.offset ?
666718
memoryref(Memory{UInt8}()) :
667-
memoryref(io.data, 1),
668-
io.size)
719+
memoryref(io.data, io.offset + 1),
720+
io.size - io.offset)
669721

670722
function write(to::IO, from::GenericIOBuffer)
671723
available = bytesavailable(from)
@@ -820,7 +872,7 @@ function copyline(out::GenericIOBuffer, s::IO; keep::Bool=false)
820872
copyuntil(out, s, 0x0a, keep=true)
821873
line = out.data
822874
i = out.size
823-
if keep || iszero(i) || line[i] != 0x0a
875+
if keep || i == out.offset || line[i] != 0x0a
824876
return out
825877
elseif i < 2 || line[i-1] != 0x0d
826878
i -= 1

0 commit comments

Comments
 (0)