2
2
3
3
# # work with AbstractVector{UInt8} via I/O primitives ##
4
4
5
- # Stateful string
5
+ # Here, u represents used bytes (already read), X represents bytes still to read,
6
+ # - represents bytes uninitialized data but which can be written to later,
7
+ # and . represents content of `data` which the IOBuffer will not interact with.
8
+
9
+ # ...uuuuuuuuuuuuuXXXXXXXXXXXXX------.......
10
+ # | | | | | | |
11
+ # | offset | ptr size | |
12
+ # 1 mark | lastindex(data)
13
+ # maxsize
14
+
15
+ # AFTER COMPACTION
16
+ # Mark, ptr and size decreases by (mark - offset - 1)
17
+
18
+ # ...uuuuuXXXXXXXXXXXXX--------------.......
19
+ # | || | | | |
20
+ # | offset| size | lastindex(data)
21
+ # 1 | ptr maxsize
22
+ # mark ( == offset + 1)
23
+
24
+ # * The underlying array is always 1-indexed
25
+ # * Data in 1:offset is never touched. This allows an IOBuffer to use a subset of a larger array.
26
+ # * Data in maxsize+1:lastindex(data) is also never touched.
27
+ # * Data in offset+1:mark-1, if mark > 0 can be deleted, shifting the whole thing to the left
28
+ # to make room for more data, without replacing or resizing data
29
+
6
30
mutable struct GenericIOBuffer{T<: AbstractVector{UInt8} } <: IO
7
- data:: T # T should support: getindex, setindex!, length, copyto!, similar, and (optionally) resize!
8
- reinit:: Bool # if true, data needs to be re-allocated (after take!)
31
+ # T should support: getindex, setindex!, length, copyto!, similar, and (optionally) resize!
32
+ data:: T
33
+
34
+ # The user can take control of `data` out of this struct. When that happens, instead of eagerly allocating
35
+ # a new array, we set `.reinit` to true, and then allocate a new one when needed.
36
+ reinit:: Bool
9
37
readable:: Bool
10
38
writable:: Bool
11
- seekable:: Bool # if not seekable, implementation is free to destroy (compact) past read data
12
- append:: Bool # add data at end instead of at pointer
13
- size:: Int # end pointer (and write pointer if append == true) + offset
14
- maxsize:: Int # fixed array size (typically pre-allocated)
15
- ptr:: Int # read (and maybe write) pointer + offset
16
- offset:: Int # offset of ptr and size from actual start of data and actual size
17
- mark:: Int # reset mark location for ptr (or <0 for no mark)
39
+
40
+ # If not seekable, implementation is free to destroy (compact) data in offset+1:mark-1
41
+ seekable:: Bool
42
+
43
+ # If true, write new data to the index size+1 instead of the index ptr.
44
+ append:: Bool
45
+
46
+ # Last index of `data` that has been written to. Data in size+1:end has not yet been used,
47
+ # and may contain arbitrary values.
48
+ # This value is always in offset : min(maxsize, lastindex(data))
49
+ size:: Int
50
+
51
+ # Size can never be larger than this value. This is useful for two use cases:
52
+ # 1. To prevent `data` from becoming too large, using too much memory
53
+ # 2. To guarantee that data in maxsize+1:lastindex(data) is never touched
54
+ # Note that this can be larger than lastindex(data), since data may be replaced.
55
+ # This value is always in 0:typemax(Int)
56
+ maxsize:: Int
57
+
58
+ # Data is read/written from/to ptr, except in situations where append is true, in which case
59
+ # data is still read from ptr, but written to size+1.
60
+ # This value is alwaus in offset+1 : size+1
61
+ ptr:: Int
62
+
63
+ # When constructed from a Vector `v`, `data` is its underlying memory, but the memory may contain
64
+ # data before v[1]. In that case, the offset is the number of leading bytes in the memory that
65
+ # the IO may never touch.
66
+ # This value is always in 0:lastindex(data)
67
+ offset:: Int
68
+
69
+ # Data before the marked location for non-seekable buffers can be compacted.
70
+ # If this is -1, the mark is not set.
71
+ # The purpose of the mark is to reset the stream to a given position using reset.
72
+ # This value is always == -1, or in offset+1:size
73
+ mark:: Int
18
74
19
75
function GenericIOBuffer {T} (data:: T , readable:: Bool , writable:: Bool , seekable:: Bool , append:: Bool ,
20
76
maxsize:: Integer ) where T<: AbstractVector{UInt8}
@@ -29,6 +85,9 @@ function GenericIOBuffer(data::T, readable::Bool, writable::Bool, seekable::Bool
29
85
maxsize:: Integer ) where T<: AbstractVector{UInt8}
30
86
GenericIOBuffer {T} (data, readable, writable, seekable, append, maxsize)
31
87
end
88
+
89
+ # For this method, we use the underlying Memory of the vector. Therefore, we need to set the offset,
90
+ # ptr and size accordingly, so the buffer only uses the part of the memory that the vector does.
32
91
function GenericIOBuffer (data:: Vector{UInt8} , readable:: Bool , writable:: Bool , seekable:: Bool , append:: Bool ,
33
92
maxsize:: Integer )
34
93
ref = data. ref
@@ -135,6 +194,8 @@ function IOBuffer(;
135
194
size = sizehint != = nothing ? Int (sizehint) : maxsize != typemax (Int) ? Int (maxsize) : 32
136
195
flags = open_flags (read= read, write= write, append= append, truncate= truncate)
137
196
buf = IOBuffer (
197
+ # A common usecase of IOBuffer is to incrementally construct strings. By using StringMemory
198
+ # as the default storage, we can turn the result into a string without copying.
138
199
StringMemory (size),
139
200
read= flags. read,
140
201
write= flags. write,
@@ -247,6 +308,7 @@ function read_sub(from::GenericIOBuffer, a::AbstractArray{T}, offs, nel) where T
247
308
if offs+ nel- 1 > length (a) || offs < 1 || nel < 0
248
309
throw (BoundsError ())
249
310
end
311
+ # Use memcpy where applicable for performance
250
312
if isa (a, MutableDenseArrayType{UInt8})
251
313
nb = UInt (nel * sizeof (T))
252
314
GC. @preserve a unsafe_read (from, pointer (a, offs), nb)
@@ -283,18 +345,29 @@ read(from::GenericIOBuffer, ::Type{Ptr{T}}) where {T} = convert(Ptr{T}, read(fro
283
345
isreadable (io:: GenericIOBuffer ) = io. readable
284
346
iswritable (io:: GenericIOBuffer ) = io. writable
285
347
348
+ # Number of bytes that can be read from the buffer, if you seek to the start first.
286
349
filesize (io:: GenericIOBuffer ) = (io. seekable ? io. size - io. offset : bytesavailable (io))
350
+
351
+ # Number of bytes that can be read from the buffer.
287
352
bytesavailable (io:: GenericIOBuffer ) = io. size - io. ptr + 1
353
+
354
+ # Position is zero-indexed, but ptr is one-indexed, hence the -1
288
355
position (io:: GenericIOBuffer ) = io. ptr - io. offset - 1
289
356
290
357
function skip (io:: GenericIOBuffer , n:: Integer )
291
358
skip (io, clamp (n, Int))
292
359
end
360
+
293
361
function skip (io:: GenericIOBuffer , n:: Int )
362
+ # In both cases, the result will never go to before the first position,
363
+ # nor beyond the last position, and will not throw an error unless the stream
364
+ # is not seekable and try to skip a negative number of bytes.
294
365
if signbit (n)
366
+ # Skipping a negative number of bytes is equivalent to seeking backwards.
295
367
seekto = clamp (widen (position (io)) + widen (n), Int)
296
368
seek (io, seekto) # Does error checking
297
369
else
370
+ # Don't use seek in order to allow a non-seekable IO to still skip bytes.
298
371
n_max = io. size + 1 - io. ptr
299
372
io. ptr += min (n, n_max)
300
373
io
304
377
function seek (io:: GenericIOBuffer , n:: Integer )
305
378
seek (io, clamp (n, Int))
306
379
end
380
+
307
381
function seek (io:: GenericIOBuffer , n:: Int )
308
382
if ! io. seekable
309
383
ismarked (io) || throw (ArgumentError (" seek failed, IOBuffer is not seekable and is not marked" ))
@@ -329,6 +403,7 @@ function _resize!(io::GenericIOBuffer, sz::Int)
329
403
a = io. data
330
404
offset = io. offset
331
405
if applicable (resize!, a, sz)
406
+ # TODO : This is buggy: The buffer should never touch data before the offset.
332
407
if offset != 0
333
408
size = io. size
334
409
size > offset && copyto! (a, 1 , a, offset + 1 , min (sz, size - offset))
@@ -339,6 +414,7 @@ function _resize!(io::GenericIOBuffer, sz::Int)
339
414
resize! (a, sz)
340
415
else
341
416
size = io. size
417
+ # Make a new data buffer, only if there is not room in existing buffer
342
418
if size >= sz && sz != 0
343
419
b = a
344
420
else
@@ -373,9 +449,15 @@ function truncate(io::GenericIOBuffer, n::Integer)
373
449
return io
374
450
end
375
451
452
+ # Internal method. Delete used data in the buffer, shifting the rest to the left.
453
+ # Does not delete data at or after the mark, nor data before the offset, nor data
454
+ # after ptr.
376
455
function compact (io:: GenericIOBuffer )
456
+ # For a seekable buffer, the user could always seek back to the used data.
457
+ # Therefore, it is invalid to compact a seekable buffer
377
458
io. writable || throw (ArgumentError (" compact failed, IOBuffer is not writeable" ))
378
459
io. seekable && throw (ArgumentError (" compact failed, IOBuffer is seekable" ))
460
+ # If the data is invalid and needs to be replaced, no point in compacting
379
461
io. reinit && return
380
462
local ptr:: Int , bytes_to_move:: Int
381
463
if ismarked (io) && io. mark < position (io)
@@ -386,6 +468,7 @@ function compact(io::GenericIOBuffer)
386
468
ptr = io. ptr
387
469
bytes_to_move = bytesavailable (io)
388
470
end
471
+ # TODO : Invalid: Buffer must not touch data before offset
389
472
copyto! (io. data, 1 , io. data, ptr, bytes_to_move)
390
473
io. size -= ptr - 1
391
474
io. ptr -= ptr - 1
@@ -474,19 +557,30 @@ julia> String(take!(io))
474
557
function take! (io:: GenericIOBuffer )
475
558
ismarked (io) && unmark (io)
476
559
if io. seekable
560
+ # If the buffer is seekable, then the previously consumed bytes from ptr+1:size
561
+ # must still be output, as they are not truly gone.
562
+ # Hence, we output all bytes from offset+1:io.size
477
563
nbytes = io. size - io. offset
478
564
data = copyto! (StringVector (nbytes), 1 , io. data, io. offset + 1 , nbytes)
479
565
else
566
+ # Else, if not seekable, bytes from offset+1:ptr-1 are truly gone and should not
567
+ # be output. Hence, we output `bytesavailable`, which is ptr:size
480
568
nbytes = bytesavailable (io)
481
569
data = read! (io, StringVector (nbytes))
482
570
end
571
+ # TODO : Why do we not reinit here? The user has taken control of the buffer
572
+ # so it's not safe to hold onto it.
483
573
if io. writable
484
574
io. ptr = 1
485
575
io. size = 0
486
576
io. offset = 0
487
577
end
488
578
return data
489
579
end
580
+
581
+ # This method is specialized because we know the underlying data is a Memory, so we can
582
+ # e.g. wrap directly in an array without copying. Otherwise the logic is the same as
583
+ # the generic method
490
584
function take! (io:: IOBuffer )
491
585
ismarked (io) && unmark (io)
492
586
if io. seekable
@@ -552,6 +646,7 @@ function unsafe_write(to::GenericIOBuffer, p::Ptr{UInt8}, nb::UInt)
552
646
written = Int (min (nb, Int (length (to. data)):: Int - ptr + 1 ))
553
647
towrite = written
554
648
d = to. data
649
+ # TODO : This inbounds is unsafe, since the underlying data may be of any type
555
650
while towrite > 0
556
651
@inbounds d[ptr] = unsafe_load (p)
557
652
ptr += 1
@@ -565,6 +660,9 @@ function unsafe_write(to::GenericIOBuffer, p::Ptr{UInt8}, nb::UInt)
565
660
return written
566
661
end
567
662
663
+ # TODO : We should have a method that uses memcpy (`copyto!`) for the IOBuffer case.
664
+ # Preliminary testing suggests this would be ~10x faster than the current implementation
665
+
568
666
@inline function write (to:: GenericIOBuffer , a:: UInt8 )
569
667
ensureroom (to, UInt (1 ))
570
668
ptr = (to. append ? to. size+ 1 : to. ptr)
@@ -590,15 +688,21 @@ function readbytes!(io::GenericIOBuffer, b::MutableDenseArrayType{UInt8}, nb::In
590
688
return nr
591
689
end
592
690
read (io:: GenericIOBuffer ) = read! (io, StringVector (bytesavailable (io)))
691
+
692
+ # For IO buffers, all the data is immediately available.
593
693
readavailable (io:: GenericIOBuffer ) = read (io)
694
+
594
695
read (io:: GenericIOBuffer , nb:: Integer ) = read! (io, StringVector (min (nb, bytesavailable (io))))
595
696
596
697
function occursin (delim:: UInt8 , buf:: IOBuffer )
698
+ # TODO : This pointer call is invalid. Also, perhaps this should use the default `in` method,
699
+ # which already should be implemented using memchr.
597
700
p = pointer (buf. data, buf. ptr)
598
701
q = GC. @preserve buf ccall (:memchr , Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, delim, bytesavailable (buf))
599
702
return q != C_NULL
600
703
end
601
704
705
+ # TODO : Invalid use of @inbounds, and also we can use the generic `in` method.
602
706
function occursin (delim:: UInt8 , buf:: GenericIOBuffer )
603
707
data = buf. data
604
708
for i = buf. ptr: buf. size
@@ -625,6 +729,8 @@ function copyline(out::GenericIOBuffer, s::IO; keep::Bool=false)
625
729
copyuntil (out, s, 0x0a , keep= true )
626
730
line = out. data
627
731
i = out. size # XXX : this is only correct for appended data. if the data was inserted, only ptr should change
732
+ # TODO : This computation seems to be invalid. The buffer size may be smaller than out.size
733
+ # due to the presence of out.offset.
628
734
if keep || i == out. offset || line[i] != 0x0a
629
735
return out
630
736
elseif i < 2 || line[i- 1 ] != 0x0d
@@ -644,6 +750,7 @@ function _copyline(out::IO, io::GenericIOBuffer; keep::Bool=false)
644
750
# note: findfirst + copyto! is much faster than a single loop
645
751
# except for nout ≲ 20. A single loop is 2x faster for nout=5.
646
752
nout = nread = something (findfirst (== (0x0a ), data), length (data))
753
+ # Remove the 0x0a (newline) if not keep, and also remove the 0x0d (\r) if it is there
647
754
if ! keep && nout > 0 && data[nout] == 0x0a
648
755
nout -= 1
649
756
nout > 0 && data[nout] == 0x0d && (nout -= 1 )
@@ -652,6 +759,7 @@ function _copyline(out::IO, io::GenericIOBuffer; keep::Bool=false)
652
759
io. ptr += nread
653
760
return out
654
761
end
762
+
655
763
copyline (out:: IO , io:: GenericIOBuffer ; keep:: Bool = false ) = _copyline (out, io; keep)
656
764
copyline (out:: GenericIOBuffer , io:: GenericIOBuffer ; keep:: Bool = false ) = _copyline (out, io; keep)
657
765
0 commit comments