Skip to content

Commit b7f911c

Browse files
committed
feat: add --bfs option to mc mirror for layer-by-layer traversal in S3 client operations
closes #4873
1 parent e929f89 commit b7f911c

File tree

6 files changed

+205
-6
lines changed

6 files changed

+205
-6
lines changed

cmd/client-s3.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1839,6 +1839,10 @@ func (c *S3Client) listVersionsRoutine(ctx context.Context, b, o string, opts Li
18391839
buckets = append(buckets, b)
18401840
}
18411841

1842+
if opts.Prefix != "" {
1843+
o = opts.Prefix
1844+
}
1845+
18421846
for _, b := range buckets {
18431847
var skipKey string
18441848
for objectVersion := range c.api.ListObjects(ctx, b, minio.ListObjectsOptions{
@@ -2104,6 +2108,10 @@ func (c *S3Client) listIncompleteInRoutine(ctx context.Context, contentCh chan *
21042108
func (c *S3Client) listIncompleteRecursiveInRoutine(ctx context.Context, contentCh chan *ClientContent, opts ListOptions) {
21052109
// get bucket and object from URL.
21062110
b, o := c.url2BucketAndObject()
2111+
if opts.Prefix != "" {
2112+
o = opts.Prefix
2113+
}
2114+
21072115
switch {
21082116
case b == "" && o == "":
21092117
buckets, err := c.api.ListBuckets(ctx)
@@ -2243,6 +2251,7 @@ func (c *S3Client) objectInfo2ClientContent(bucket string, entry minio.ObjectInf
22432251
}
22442252
url.Path = c.buildAbsPath(bucket, entry.Key)
22452253
content.URL = url
2254+
content.ObjectKey = entry.Key
22462255
content.BucketName = bucket
22472256
content.Size = entry.Size
22482257
content.ETag = entry.ETag
@@ -2321,6 +2330,10 @@ func (c *S3Client) bucketStat(ctx context.Context, opts BucketStatOptions) (*Cli
23212330
func (c *S3Client) listInRoutine(ctx context.Context, contentCh chan *ClientContent, opts ListOptions) {
23222331
// get bucket and object from URL.
23232332
b, o := c.url2BucketAndObject()
2333+
if opts.Prefix != "" {
2334+
o = opts.Prefix
2335+
}
2336+
23242337
if opts.ListZip && (b == "" || o == "") {
23252338
contentCh <- &ClientContent{
23262339
Err: probe.NewError(errors.New("listing zip files must provide bucket and object")),
@@ -2385,6 +2398,10 @@ func sortBucketsNameWithSlash(bucketsInfo []minio.BucketInfo) {
23852398
func (c *S3Client) listRecursiveInRoutine(ctx context.Context, contentCh chan *ClientContent, opts ListOptions) {
23862399
// get bucket and object from URL.
23872400
b, o := c.url2BucketAndObject()
2401+
if opts.Prefix != "" {
2402+
o = opts.Prefix
2403+
}
2404+
23882405
switch {
23892406
case b == "" && o == "":
23902407
buckets, err := c.api.ListBuckets(ctx)

cmd/client-url.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -190,10 +190,14 @@ func (u ClientURL) String() string {
190190
}
191191

192192
// urlJoinPath Join a path to existing URL.
193-
func urlJoinPath(url1, url2 string) string {
194-
u1 := newClientURL(url1)
195-
u2 := newClientURL(url2)
196-
return joinURLs(u1, u2).String()
193+
func urlJoinPath(base, element string) string {
194+
if strings.HasSuffix(base, "/") && strings.HasPrefix(element, "/") {
195+
return base + element[1:]
196+
}
197+
if !strings.HasSuffix(base, "/") && !strings.HasPrefix(element, "/") {
198+
return base + "/" + element
199+
}
200+
return base + element
197201
}
198202

199203
// url2Stat returns stat info for URL - supports bucket, object and a prefixe with or without a trailing slash

cmd/client.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ type ListOptions struct {
108108
TimeRef time.Time
109109
ShowDir DirOpt
110110
Count int
111+
Prefix string // Add prefix support
111112
}
112113

113114
// CopyOptions holds options for copying operation
@@ -213,6 +214,7 @@ type Client interface {
213214
// ClientContent - Content container for content metadata
214215
type ClientContent struct {
215216
URL ClientURL
217+
ObjectKey string
216218
BucketName string // only valid and set for client-type objectStorage
217219
Time time.Time
218220
Size int64

cmd/difference.go

Lines changed: 171 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,171 @@ func getSourceModTimeKey(metadata map[string]string) string {
8282
return ""
8383
}
8484

85+
// layerDifference performs a breadth-first search (BFS) comparison between source and target.
86+
// Unlike the standard recursive listing approach, this function traverses the object hierarchy
87+
// layer by layer (directory by directory), which prevents overwhelming the server with
88+
// large recursive listing operations that could cause timeouts or connection failures.
89+
//
90+
// This approach is especially useful for buckets containing millions of objects where
91+
// a standard recursive listing might cause server-side resource exhaustion. By exploring
92+
// the hierarchy level by level and comparing objects at each layer, this function provides
93+
// a more scalable solution for large object stores.
94+
//
95+
// The BFS approach:
96+
// 1. Starts with the root prefix ("") for both source and target
97+
// 2. Lists objects at the current level/prefix (non-recursively)
98+
// 3. Compares objects found at this level
99+
// 4. Queues any directories found for exploration in the next iteration
100+
// 5. Continues until all directories in both source and target are explored
101+
//
102+
// This is enabled with the --bfs parameter to avoid the limitations of recursive listing.
103+
func layerDifference(ctx context.Context, sourceClnt, targetClnt Client, opts mirrorOptions) chan diffMessage {
104+
diffCh := make(chan diffMessage, 10000)
105+
106+
go func() {
107+
defer close(diffCh)
108+
109+
// Channels to feed items found by BFS into the difference engine
110+
srcClientCh := make(chan *ClientContent, 1000)
111+
tgtClientCh := make(chan *ClientContent, 1000)
112+
113+
// Goroutine to perform BFS on the source
114+
go func() {
115+
defer close(srcClientCh)
116+
// Queue for *relative object prefixes* to explore
117+
queue := []string{""} // "" represents the root prefix
118+
119+
for len(queue) > 0 {
120+
// Dequeue the next relative prefix
121+
prefix := queue[0]
122+
queue = queue[1:]
123+
124+
// List items at the current prefix level using the relative prefix
125+
listCtx, listCancel := context.WithCancel(ctx)
126+
contentsCh := sourceClnt.List(listCtx, ListOptions{
127+
Recursive: false, // List only the current level
128+
WithMetadata: opts.isMetadata,
129+
ShowDir: DirLast, // Ensure directories are listed
130+
Prefix: prefix, // Pass the relative prefix
131+
})
132+
133+
for content := range contentsCh {
134+
select {
135+
case <-ctx.Done():
136+
listCancel()
137+
return
138+
default:
139+
if content != nil && content.Err != nil {
140+
srcClientCh <- content
141+
listCancel()
142+
continue
143+
}
144+
if content == nil {
145+
continue
146+
}
147+
148+
// Send the valid content (file or directory) for comparison
149+
srcClientCh <- content
150+
151+
// If it's a directory, queue its *relative object key* for the next level
152+
if content.Type.IsDir() {
153+
relativeKey := content.ObjectKey // Get the relative key
154+
// Prevent infinite loops: don't re-queue the prefix we just listed,
155+
// especially the root ("") which might list itself as "/" depending on backend.
156+
// Also check if ObjectKey is populated.
157+
if relativeKey != "" && relativeKey != prefix {
158+
// Ensure the key ends with a separator if it's a directory prefix
159+
// The S3 ListObjects usually returns directory keys ending with '/'
160+
if !strings.HasSuffix(relativeKey, string(content.URL.Separator)) {
161+
// This case might indicate a non-standard directory representation, handle cautiously
162+
// For standard S3, common prefixes already end in '/'
163+
// If needed, append separator: relativeKey += string(content.URL.Separator)
164+
}
165+
// Add the relative key (prefix) to the queue
166+
queue = append(queue, relativeKey)
167+
}
168+
}
169+
}
170+
}
171+
listCancel()
172+
}
173+
}()
174+
175+
// Goroutine to perform BFS on the target (symmetric to the source)
176+
go func() {
177+
defer close(tgtClientCh)
178+
// Queue for *relative object prefixes*
179+
queue := []string{""}
180+
181+
for len(queue) > 0 {
182+
prefix := queue[0]
183+
queue = queue[1:]
184+
185+
listCtx, listCancel := context.WithCancel(ctx)
186+
contentsCh := targetClnt.List(listCtx, ListOptions{
187+
Recursive: false,
188+
WithMetadata: opts.isMetadata,
189+
ShowDir: DirLast,
190+
Prefix: prefix, // Pass the relative prefix
191+
})
192+
193+
for content := range contentsCh {
194+
select {
195+
case <-ctx.Done():
196+
listCancel()
197+
return
198+
default:
199+
if content != nil && content.Err != nil {
200+
tgtClientCh <- content
201+
listCancel()
202+
continue
203+
}
204+
if content == nil {
205+
continue
206+
}
207+
208+
tgtClientCh <- content
209+
210+
// If it's a directory, queue its *relative object key*
211+
if content.Type.IsDir() {
212+
relativeKey := content.ObjectKey
213+
if relativeKey != "" && relativeKey != prefix {
214+
// Ensure trailing slash if needed (usually present from S3 List)
215+
if !strings.HasSuffix(relativeKey, string(content.URL.Separator)) {
216+
// Handle non-standard directory representation if necessary
217+
}
218+
queue = append(queue, relativeKey)
219+
}
220+
}
221+
}
222+
}
223+
listCancel()
224+
}
225+
}()
226+
227+
// Comparison logic remains the same
228+
err := differenceInternal(
229+
sourceClnt.GetURL().String(),
230+
srcClientCh,
231+
targetClnt.GetURL().String(),
232+
tgtClientCh,
233+
opts,
234+
false, // returnSimilar is false
235+
diffCh,
236+
)
237+
238+
if err != nil {
239+
select {
240+
case <-ctx.Done():
241+
default:
242+
diffCh <- diffMessage{Error: err}
243+
}
244+
}
245+
}()
246+
247+
return diffCh
248+
}
249+
85250
// activeActiveModTimeUpdated tries to calculate if the object copy in the target
86251
// is older than the one in the source by comparing the modtime of the data.
87252
func activeActiveModTimeUpdated(src, dst *ClientContent) bool {
@@ -167,7 +332,12 @@ func bucketObjectDifference(ctx context.Context, sourceClnt, targetClnt Client)
167332
})
168333
}
169334

170-
func objectDifference(ctx context.Context, sourceClnt, targetClnt Client, opts mirrorOptions) (diffCh chan diffMessage) {
335+
func objectDifference(ctx context.Context, sourceClnt, targetClnt Client, opts mirrorOptions) chan diffMessage {
336+
if opts.bfs {
337+
// Use layer-by-layer difference for regular objects
338+
return layerDifference(ctx, sourceClnt, targetClnt, opts)
339+
}
340+
171341
sourceURL := sourceClnt.GetURL().String()
172342
sourceCh := sourceClnt.List(ctx, ListOptions{Recursive: true, WithMetadata: opts.isMetadata, ShowDir: DirNone})
173343

cmd/mirror-main.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ var (
144144
Name: "skip-errors",
145145
Usage: "skip any errors when mirroring",
146146
},
147+
cli.BoolFlag{
148+
Name: "bfs",
149+
Usage: "using BFS for layer-by-layer traversal of files, suitable for large number of files",
150+
},
147151
checksumFlag,
148152
}
149153
)
@@ -212,7 +216,7 @@ EXAMPLES:
212216
{{.Prompt}} {{.HelpName}} --older-than 30d s3/test ~/test
213217
214218
13. Mirror server encrypted objects from Amazon S3 cloud storage to a bucket on Amazon S3 cloud storage
215-
{{.Prompt}} {{.HelpName}} --enc-c "minio/archive=MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDA" --enc-c "s3/archive=MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5BBB" s3/archive/ minio/archive/
219+
{{.Prompt}} {{.HelpName}} --enc-c "minio/archive=MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDA" --enc-c "s3/archive=MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5BBB" s3/archive/ minio/archive/
216220
217221
14. Update 'Cache-Control' header on all existing objects recursively.
218222
{{.Prompt}} {{.HelpName}} --attr "Cache-Control=max-age=90000,min-fresh=9000" myminio/video-files myminio/video-files
@@ -1024,6 +1028,7 @@ func runMirror(ctx context.Context, srcURL, dstURL string, cli *cli.Context, enc
10241028
userMetadata: userMetadata,
10251029
encKeyDB: encKeyDB,
10261030
activeActive: isWatch,
1031+
bfs: cli.Bool("bfs"),
10271032
}
10281033

10291034
// If we are not using active/active and we are not removing

cmd/mirror-url.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ type mirrorOptions struct {
278278
userMetadata map[string]string
279279
checksum minio.ChecksumType
280280
sourceListingOnly bool
281+
bfs bool
281282
}
282283

283284
// Prepares urls that need to be copied or removed based on requested options.

0 commit comments

Comments
 (0)