Skip to content

Commit 1a7f1ca

Browse files
authored
Fixes for inefficient reads of ZIP entries (#273)
1 parent 6fb2d08 commit 1a7f1ca

File tree

13 files changed

+141
-36
lines changed

13 files changed

+141
-36
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ require (
1111
github.com/aws/smithy-go v1.23.1
1212
github.com/azr/phash v0.2.0
1313
github.com/bbrks/go-blurhash v1.1.1
14+
github.com/chocolatkey/gzran v0.0.0-20251204101541-d8891e235711
1415
github.com/deckarep/golang-set v1.8.0
1516
github.com/disintegration/imaging v1.6.2
1617
github.com/go-viper/mapstructure/v2 v2.4.0

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ github.com/bbrks/go-blurhash v1.1.1/go.mod h1:lkAsdyXp+EhARcUo85yS2G1o+Sh43I2ebF
8585
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
8686
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
8787
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
88+
github.com/chocolatkey/gzran v0.0.0-20251204101541-d8891e235711 h1:KXBH2rdtVs70qr55arSwgrXZq6QasYgox1GbYdi3kRg=
89+
github.com/chocolatkey/gzran v0.0.0-20251204101541-d8891e235711/go.mod h1:jk2T+gAWOv82T5A5XU+h/bA+9ngcj+DkHNrP/Ktyt88=
8890
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
8991
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
9092
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=

pkg/archive/archive.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ type Entry interface {
8080
StreamCompressedGzip(w io.Writer) (int64, error) // Streams the compressed content of this entry to a writer in a GZIP container.
8181
ReadCompressed() ([]byte, error) // Reads the compressed content of this entry.
8282
ReadCompressedGzip() ([]byte, error) // Reads the compressed content of this entry inside a GZIP container.
83+
84+
CRC32Checksum() *uint32 // Returns the CRC32 checksum of the uncompressed data.
8385
}
8486

8587
// Represents an immutable archive.

pkg/archive/archive_exploded.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ func (e explodedArchiveEntry) CompressedLength() uint64 {
2626
return 0
2727
}
2828

29+
func (e explodedArchiveEntry) CRC32Checksum() *uint32 {
30+
return nil
31+
}
32+
2933
func (e explodedArchiveEntry) CompressedAs(compressionMethod CompressionMethod) bool {
3034
return false
3135
}
@@ -54,7 +58,11 @@ func (e explodedArchiveEntry) Read(start int64, end int64) ([]byte, error) {
5458
}
5559
}
5660
data := make([]byte, end-start+1)
57-
n, err := f.Read(data)
61+
n, err := io.ReadFull(f, data)
62+
if n > 0 && err == io.ErrUnexpectedEOF {
63+
// Not EOF error if some data was read
64+
err = nil
65+
}
5866
return data[:n], err
5967
}
6068

pkg/archive/archive_zip.go

Lines changed: 82 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -11,30 +11,39 @@ import (
1111
"path"
1212
"sync"
1313

14+
"github.com/chocolatkey/gzran"
1415
"github.com/pkg/errors"
1516
)
1617

1718
type gozipArchiveEntry struct {
1819
file *zip.File
1920
minimizeReads bool
21+
22+
gi gzran.Index
23+
gm sync.Mutex
2024
}
2125

22-
func (e gozipArchiveEntry) Path() string {
26+
func (e *gozipArchiveEntry) Path() string {
2327
return path.Clean(e.file.Name)
2428
}
2529

26-
func (e gozipArchiveEntry) Length() uint64 {
30+
func (e *gozipArchiveEntry) Length() uint64 {
2731
return e.file.UncompressedSize64
2832
}
2933

30-
func (e gozipArchiveEntry) CompressedLength() uint64 {
34+
func (e *gozipArchiveEntry) CompressedLength() uint64 {
3135
if e.file.Method == zip.Store {
3236
return 0
3337
}
3438
return e.file.CompressedSize64
3539
}
3640

37-
func (e gozipArchiveEntry) CompressedAs(compressionMethod CompressionMethod) bool {
41+
func (e *gozipArchiveEntry) CRC32Checksum() *uint32 {
42+
c := e.file.CRC32
43+
return &c
44+
}
45+
46+
func (e *gozipArchiveEntry) CompressedAs(compressionMethod CompressionMethod) bool {
3847
if compressionMethod != CompressionMethodDeflate {
3948
return false
4049
}
@@ -45,11 +54,11 @@ func (e gozipArchiveEntry) CompressedAs(compressionMethod CompressionMethod) boo
4554
// It's especially useful when trying to stream the ZIP from a remote file, e.g.
4655
// cloud storage. It's only enabled when trying to read the entire file and compression
4756
// is enabled. Care needs to be taken to cover every edge case.
48-
func (e gozipArchiveEntry) couldMinimizeReads() bool {
57+
func (e *gozipArchiveEntry) couldMinimizeReads() bool {
4958
return e.minimizeReads && e.CompressedLength() > 0
5059
}
5160

52-
func (e gozipArchiveEntry) Read(start int64, end int64) ([]byte, error) {
61+
func (e *gozipArchiveEntry) Read(start int64, end int64) ([]byte, error) {
5362
if end < start {
5463
return nil, errors.New("range not satisfiable")
5564
}
@@ -73,14 +82,60 @@ func (e gozipArchiveEntry) Read(start int64, end int64) ([]byte, error) {
7382
}
7483

7584
if minimizeReads {
76-
compressedData := make([]byte, e.file.CompressedSize64)
77-
_, err := io.ReadFull(f, compressedData)
78-
if err != nil {
79-
return nil, err
85+
// If uncompressed size is smaller than 1MB, it's not worth
86+
// using deflate random access, because the state itself takes memory.
87+
// We also skip using zrand logic if the entire file is being requested,
88+
// because that means the client probably won't need a partial range
89+
if e.file.UncompressedSize64 < ZRandCutoff || (start == 0 && (end == 0 || end == int64(e.file.UncompressedSize64-1))) {
90+
compressedData := make([]byte, e.file.CompressedSize64)
91+
_, err := io.ReadFull(f, compressedData)
92+
if err != nil {
93+
return nil, err
94+
}
95+
frdr := flate.NewReader(bytes.NewReader(compressedData))
96+
defer frdr.Close()
97+
f = frdr
98+
} else {
99+
e.gm.Lock()
100+
var lastCompressedOffset int64
101+
for _, v := range e.gi {
102+
if v.CompressedOffset > lastCompressedOffset && v.UncompressedOffset <= start {
103+
lastCompressedOffset = v.CompressedOffset
104+
}
105+
}
106+
e.gm.Unlock()
107+
108+
compressedData := make([]byte, e.file.CompressedSize64)
109+
f.(io.Seeker).Seek(lastCompressedOffset, io.SeekStart)
110+
_, err := io.ReadFull(f, compressedData[lastCompressedOffset:])
111+
if err != nil {
112+
return nil, err
113+
}
114+
115+
// This special reader lets us restore the decompressor state at known offsets
116+
// which is useful when a client has already requested previous parts of the file,
117+
// such as when a web browser requests subsequent byte ranges for media playback.
118+
fzr, err := gzran.NewDReader(bytes.NewReader(compressedData)) // Default interval = 1MB, same as current ZRandCutoff
119+
if err != nil {
120+
return nil, err
121+
}
122+
// Note: if an implementor uses the same publication instance for all clients,
123+
// this code will lock all clients. This could be problematic and should be
124+
// mitigated in a future version. For us to get this far is pretty rare though,
125+
// and mainly applies to multimedia that is natively streamed by web browsers and
126+
// was also inconveniently compressed by the original author of the ZIP.
127+
e.gm.Lock()
128+
defer e.gm.Unlock()
129+
defer func() {
130+
e.gi = fzr.Index
131+
}()
132+
defer fzr.Close()
133+
if len(e.gi) > 0 {
134+
fzr.Index = e.gi
135+
}
136+
137+
f = fzr
80138
}
81-
frdr := flate.NewReader(bytes.NewReader(compressedData))
82-
defer frdr.Close()
83-
f = frdr
84139
}
85140

86141
if start == 0 && end == 0 {
@@ -92,21 +147,25 @@ func (e gozipArchiveEntry) Read(start int64, end int64) ([]byte, error) {
92147
return data, nil
93148
}
94149
if start > 0 {
95-
_, err := io.CopyN(io.Discard, f, start)
150+
if skr, ok := f.(io.Seeker); ok {
151+
_, err = skr.Seek(start, io.SeekStart)
152+
} else {
153+
_, err = io.CopyN(io.Discard, f, start)
154+
}
96155
if err != nil {
97156
return nil, err
98157
}
99158
}
100159
data := make([]byte, end-start+1)
101-
n, err := f.Read(data)
102-
if n > 0 && err == io.EOF {
160+
n, err := io.ReadFull(f, data)
161+
if n > 0 && err == io.ErrUnexpectedEOF {
103162
// A short read is not an error if some data was read
104163
err = nil
105164
}
106165
return data[:n], err
107166
}
108167

109-
func (e gozipArchiveEntry) Stream(w io.Writer, start int64, end int64) (int64, error) {
168+
func (e *gozipArchiveEntry) Stream(w io.Writer, start int64, end int64) (int64, error) {
110169
if end < start {
111170
return -1, errors.New("range not satisfiable")
112171
}
@@ -157,7 +216,7 @@ func (e gozipArchiveEntry) Stream(w io.Writer, start int64, end int64) (int64, e
157216
return n, err
158217
}
159218

160-
func (e gozipArchiveEntry) StreamCompressed(w io.Writer) (int64, error) {
219+
func (e *gozipArchiveEntry) StreamCompressed(w io.Writer) (int64, error) {
161220
if e.file.Method != zip.Deflate {
162221
return -1, errors.New("not a compressed resource")
163222
}
@@ -169,7 +228,7 @@ func (e gozipArchiveEntry) StreamCompressed(w io.Writer) (int64, error) {
169228
return io.Copy(w, f)
170229
}
171230

172-
func (e gozipArchiveEntry) StreamCompressedGzip(w io.Writer) (int64, error) {
231+
func (e *gozipArchiveEntry) StreamCompressedGzip(w io.Writer) (int64, error) {
173232
if e.file.Method != zip.Deflate {
174233
return -1, errors.New("not a compressed resource")
175234
}
@@ -205,7 +264,7 @@ func (e gozipArchiveEntry) StreamCompressedGzip(w io.Writer) (int64, error) {
205264
return int64(n) + nn + int64(nnn), nil
206265
}
207266

208-
func (e gozipArchiveEntry) ReadCompressed() ([]byte, error) {
267+
func (e *gozipArchiveEntry) ReadCompressed() ([]byte, error) {
209268
if e.file.Method != zip.Deflate {
210269
return nil, errors.New("not a compressed resource")
211270
}
@@ -223,7 +282,7 @@ func (e gozipArchiveEntry) ReadCompressed() ([]byte, error) {
223282
return compressedData, nil
224283
}
225284

226-
func (e gozipArchiveEntry) ReadCompressedGzip() ([]byte, error) {
285+
func (e *gozipArchiveEntry) ReadCompressedGzip() ([]byte, error) {
227286
if e.file.Method != zip.Deflate {
228287
return nil, errors.New("not a compressed resource")
229288
}
@@ -280,7 +339,7 @@ func (a *gozipArchive) Entries() []Entry {
280339

281340
aentry, ok := a.cachedEntries.Load(f.Name)
282341
if !ok {
283-
aentry = gozipArchiveEntry{
342+
aentry = &gozipArchiveEntry{
284343
file: f,
285344
minimizeReads: a.minimizeReads,
286345
}
@@ -307,7 +366,7 @@ func (a *gozipArchive) Entry(p string) (Entry, error) {
307366
for _, f := range a.zip.File {
308367
fp := path.Clean(f.Name)
309368
if fp == cpath {
310-
aentry := gozipArchiveEntry{
369+
aentry := &gozipArchiveEntry{
311370
file: f,
312371
minimizeReads: a.minimizeReads,
313372
}

pkg/archive/gzip.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,9 @@ const (
88
gzipDeflate = 8
99
)
1010

11-
const GzipWrapperLength = 18
11+
const GzipHeaderLength = 10
12+
const GzipTrailerLength = 8
13+
const GzipWrapperLength = GzipHeaderLength + GzipTrailerLength
1214
const GzipMaxLength = math.MaxUint32
15+
16+
const ZRandCutoff = 1024 * 1024 // 1MB

pkg/fetcher/fetcher_archive.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,11 @@ func (r *entryResource) CompressedLength(ctx context.Context) int64 {
194194
return int64(r.entry.CompressedLength())
195195
}
196196

197+
// CRC32Checksum implements CompressedResource
198+
func (r *entryResource) CRC32Checksum(ctx context.Context) *uint32 {
199+
return r.entry.CRC32Checksum()
200+
}
201+
197202
// StreamCompressed implements CompressedResource
198203
func (r *entryResource) StreamCompressed(ctx context.Context, w io.Writer) (int64, *ResourceError) {
199204
i, err := r.entry.StreamCompressed(w)

pkg/fetcher/fetcher_file.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,8 @@ func (r *FileResource) Read(ctx context.Context, start int64, end int64) ([]byte
199199
}
200200
return data[:n], nil
201201
} else {
202-
n, err := f.Read(data)
203-
if err != nil && err != io.EOF {
202+
n, err := io.ReadFull(f, data)
203+
if err != nil && err != io.ErrUnexpectedEOF {
204204
return nil, Other(err)
205205
}
206206
return data[:n], nil

pkg/fetcher/resource.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,15 @@ func (r ProxyResource) CompressedLength(ctx context.Context) int64 {
351351
return cres.CompressedLength(ctx)
352352
}
353353

354+
// CRC32Checksum implements CompressedResource
355+
func (r ProxyResource) CRC32Checksum(ctx context.Context) *uint32 {
356+
cres, ok := r.Res.(CompressedResource)
357+
if !ok {
358+
return nil
359+
}
360+
return cres.CRC32Checksum(ctx)
361+
}
362+
354363
// StreamCompressed implements CompressedResource
355364
func (r ProxyResource) StreamCompressed(ctx context.Context, w io.Writer) (int64, *ResourceError) {
356365
cres, ok := r.Res.(CompressedResource)
@@ -578,6 +587,15 @@ func (r *LazyResource) CompressedLength(ctx context.Context) int64 {
578587
return cres.CompressedLength(ctx)
579588
}
580589

590+
// CRC32Checksum implements CompressedResource
591+
func (r *LazyResource) CRC32Checksum(ctx context.Context) *uint32 {
592+
cres, ok := r.resource().(CompressedResource)
593+
if !ok {
594+
return nil
595+
}
596+
return cres.CRC32Checksum(ctx)
597+
}
598+
581599
// StreamCompressed implements CompressedResource
582600
func (r *LazyResource) StreamCompressed(ctx context.Context, w io.Writer) (int64, *ResourceError) {
583601
cres, ok := r.resource().(CompressedResource)

pkg/fetcher/traits.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@ type CompressedResource interface {
1414
StreamCompressedGzip(ctx context.Context, w io.Writer) (int64, *ResourceError)
1515
ReadCompressed(ctx context.Context) ([]byte, *ResourceError)
1616
ReadCompressedGzip(ctx context.Context) ([]byte, *ResourceError)
17+
CRC32Checksum(ctx context.Context) *uint32
1718
}

0 commit comments

Comments
 (0)