From fe18dabf4bd5c6e6df311691542637bed6acfe39 Mon Sep 17 00:00:00 2001
From: Will Scott <will.scott@protocol.ai>
Date: Sun, 6 Aug 2023 12:41:20 +0100
Subject: [PATCH 1/4] Add EstimateUnixFSFileDefaultChunking for estimating how
 many bytes the car file representing a unixfs file of a given size will take
 Fix #58

---
 data/builder/file.go      | 86 +++++++++++++++++++++++++++++++++++++++
 data/builder/file_test.go | 72 ++++++++++++++++++++++++++++++--
 go.mod                    |  3 +-
 go.sum                    |  2 +
 4 files changed, 159 insertions(+), 4 deletions(-)

diff --git a/data/builder/file.go b/data/builder/file.go
index a522784..323b147 100644
--- a/data/builder/file.go
+++ b/data/builder/file.go
@@ -13,6 +13,7 @@ import (
 	basicnode "github.com/ipld/go-ipld-prime/node/basic"
 	"github.com/multiformats/go-multicodec"
 	multihash "github.com/multiformats/go-multihash/core"
+	"github.com/multiformats/go-varint"
 
 	// raw needed for opening as bytes
 	_ "github.com/ipld/go-ipld-prime/codec/raw"
@@ -57,6 +58,91 @@ func BuildUnixFSFile(r io.Reader, chunker string, ls *ipld.LinkSystem) (ipld.Lin
 	}
 }
 
+// EstimateUnixFSFileDefaultChunking estimates the byte size of the CAR file
+// needed to hold a UnixFS file of dataLength bytes split at chunk.DefaultBlockSize.
+func EstimateUnixFSFileDefaultChunking(dataLength uint64) uint64 {
+	blkSize := chunk.DefaultBlockSize
+	blocks := dataLength / uint64(blkSize)
+	remainder := dataLength % uint64(blkSize) // bytes left over for a final short chunk
+
+	size := dataLength // running total, seeded with the raw payload bytes
+	cidExample, _ := leafLinkProto.Prefix.Sum([]byte{0})
+	cidLength := uint64(len(cidExample.Bytes()))
+
+	links := []uint64{}
+	for i := uint64(0); i < blocks; i++ {
+		links = append(links, uint64(chunk.DefaultBlockSize))
+	}
+	// each full raw block costs its CID plus a uvarint length prefix covering CID + payload.
+	size += uint64(len(links)) * (cidLength + uint64(varint.UvarintSize(cidLength+uint64(blkSize))))
+	if remainder > 0 { // the final short block carries the same per-block overhead
+		links = append(links, remainder)
+		size += cidLength + uint64(varint.UvarintSize(cidLength+uint64(remainder)))
+	}
+
+	// account for the intermediate dagpb nodes: they are built and stored in an in-memory link system so their encoded size can be measured.
+	ls := cidlink.DefaultLinkSystem()
+	storage := cidlink.Memory{}
+	ls.StorageReadOpener = storage.OpenRead
+	ls.StorageWriteOpener = storage.OpenWrite
+
+	icnt := 0
+	for len(links) > 1 { // each pass folds one level of links into parent nodes, until at most one link is left
+		nxtLnks := []uint64{}
+		for len(links) > 1 { // take links in groups of up to DefaultLinksPerBlock
+			icnt++
+			children := uint64(DefaultLinksPerBlock)
+			if len(links) < DefaultLinksPerBlock {
+				children = uint64(len(links))
+			}
+			childrenLinks := links[:children]
+			links = links[children:]
+			totalSize := uint64(0)
+			for _, l := range childrenLinks {
+				totalSize += l
+			}
+
+			node, _ := BuildUnixFS(func(b *Builder) { // NOTE(review): errors from the builders in this loop are discarded
+				FileSize(b, totalSize)
+				BlockSizes(b, childrenLinks)
+			})
+
+			// Pack into a dagpb node; cidExample stands in for every child link.
+			dpbb := dagpb.Type.PBNode.NewBuilder()
+			pbm, _ := dpbb.BeginMap(2)
+			pblb, _ := pbm.AssembleEntry("Links")
+			pbl, _ := pblb.BeginList(int64(len(childrenLinks)))
+			for _, c := range childrenLinks {
+				pbln, _ := BuildUnixFSDirectoryEntry("", int64(c), cidlink.Link{Cid: cidExample})
+				pbl.AssembleValue().AssignNode(pbln)
+			}
+			pbl.Finish()
+			pbm.AssembleKey().AssignString("Data")
+			pbm.AssembleValue().AssignBytes(data.EncodeUnixFSData(node))
+			pbm.Finish()
+			pbn := dpbb.Build()
+			pbLnk := ls.MustStore(ipld.LinkContext{}, fileLinkProto, pbn)
+			pbRcrd, _ := ls.LoadRaw(ipld.LinkContext{}, pbLnk)
+
+			// encoded size of the stored dagpb block, as read back from the link system
+			intermediateNodeSize := uint64(len(pbRcrd))
+
+			size += intermediateNodeSize + cidLength + uint64(varint.UvarintSize(cidLength+intermediateNodeSize))
+			nxtLnks = append(nxtLnks, totalSize) // the parent is represented one level up by the byte total of its children
+		}
+		if len(links) == 1 { // a lone leftover link moves up a level unchanged
+			nxtLnks = append(nxtLnks, links[0])
+		}
+		links = nxtLnks
+	}
+	fmt.Printf("estimated %d intermeidate nodes\n", icnt) // NOTE(review): writes to stdout on every call; the message misspells "intermediate"
+
+	// add the car header (fixed at 59 bytes here; NOTE(review): presumably a CARv1 header with a single root - confirm)
+	size += 59
+
+	return size
+}
+
 var fileLinkProto = cidlink.LinkPrototype{
 	Prefix: cid.Prefix{
 		Version:  1,
diff --git a/data/builder/file_test.go b/data/builder/file_test.go
index de3803e..db4208c 100644
--- a/data/builder/file_test.go
+++ b/data/builder/file_test.go
@@ -1,16 +1,27 @@
-package builder
+package builder_test
 
 import (
 	"bytes"
 	"context"
+	"fmt"
+	"io"
+	"math/rand"
 	"testing"
 
+	"github.com/ipfs/go-unixfsnode/data/builder"
+	"github.com/multiformats/go-multicodec"
+	multihash "github.com/multiformats/go-multihash/core"
+
 	"github.com/ipfs/go-cid"
 	u "github.com/ipfs/go-ipfs-util"
 	"github.com/ipfs/go-unixfsnode/file"
+	carv1 "github.com/ipld/go-car"
+	"github.com/ipld/go-car/v2"
 	dagpb "github.com/ipld/go-codec-dagpb"
 	"github.com/ipld/go-ipld-prime"
+	"github.com/ipld/go-ipld-prime/linking"
 	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
+	selectorparse "github.com/ipld/go-ipld-prime/traversal/selector/parse"
 )
 
 func TestBuildUnixFSFile(t *testing.T) {
@@ -23,7 +34,7 @@ func TestBuildUnixFSFile(t *testing.T) {
 	ls.StorageReadOpener = storage.OpenRead
 	ls.StorageWriteOpener = storage.OpenWrite
 
-	f, _, err := BuildUnixFSFile(r, "", &ls)
+	f, _, err := builder.BuildUnixFSFile(r, "", &ls)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -43,6 +54,61 @@ func TestBuildUnixFSFile(t *testing.T) {
 	}
 }
 
+func TestEstimateUnixFSFileDefaultChunking(t *testing.T) { // the estimate must equal the size of a CAR actually built from the data
+	for i := 100; i < 1000000000; i *= 10 {
+		b := make([]byte, i) // i runs 100, 1_000, ..., 100_000_000
+		rand.Read(b)
+
+		ls := cidlink.DefaultLinkSystem()
+		storage := cidlink.Memory{}
+		ls.StorageReadOpener = storage.OpenRead
+		nPB := 0 // dag-pb blocks committed, printed if the estimate is off
+
+		ls.StorageWriteOpener = func(lc linking.LinkContext) (io.Writer, linking.BlockWriteCommitter, error) { // wraps the in-memory writer to count dag-pb commits
+			w, bwc, err := storage.OpenWrite(lc)
+			return w, func(lnk ipld.Link) error {
+				if lnk.(cidlink.Link).Cid.Prefix().Codec == uint64(multicodec.DagPb) {
+					nPB++
+				}
+				return bwc(lnk)
+			}, err
+		}
+		rt, _, err := builder.BuildUnixFSFile(bytes.NewReader(b), "", &ls)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		ob := bytes.NewBuffer(nil) // receives the output of car.TraverseV1 below
+		_, err = car.TraverseV1(context.Background(), &ls, rt.(cidlink.Link).Cid, selectorparse.CommonSelector_ExploreAllRecursively, ob)
+		if err != nil {
+			t.Fatal(err)
+		}
+		fileLen := len(ob.Bytes())
+
+		estimate := builder.EstimateUnixFSFileDefaultChunking(uint64(i))
+		if estimate != uint64(fileLen) { // exact match required, not an approximation
+			fmt.Printf("%d intermediate nodes.\n", nPB)
+			t.Fatalf("estimate for file length %d was %d. should be %d", i, estimate, fileLen)
+		}
+	}
+}
+
+func TestS(t *testing.T) {
+	p := cid.Prefix{
+		Version:  1,
+		Codec:    uint64(multicodec.DagPb),
+		MhType:   multihash.SHA2_256,
+		MhLength: 32,
+	}
+	rt, _ := p.Sum([]byte{0})
+	ch := carv1.CarHeader{
+		Roots:   []cid.Cid{rt},
+		Version: 1,
+	}
+	s, _ := carv1.HeaderSize(&ch)
+	t.Fatalf("hs: %d\n", s)
+}
+
 func TestUnixFSFileRoundtrip(t *testing.T) {
 	buf := make([]byte, 10*1024*1024)
 	u.NewSeededRand(0xdeadbeef).Read(buf)
@@ -53,7 +119,7 @@ func TestUnixFSFileRoundtrip(t *testing.T) {
 	ls.StorageReadOpener = storage.OpenRead
 	ls.StorageWriteOpener = storage.OpenWrite
 
-	f, _, err := BuildUnixFSFile(r, "", &ls)
+	f, _, err := builder.BuildUnixFSFile(r, "", &ls)
 	if err != nil {
 		t.Fatal(err)
 	}
diff --git a/go.mod b/go.mod
index c31842f..44f92a1 100644
--- a/go.mod
+++ b/go.mod
@@ -10,11 +10,13 @@ require (
 	github.com/ipfs/go-ipld-format v0.4.0
 	github.com/ipfs/go-merkledag v0.10.0
 	github.com/ipfs/go-unixfs v0.4.4
+	github.com/ipld/go-car v0.5.0
 	github.com/ipld/go-car/v2 v2.8.0
 	github.com/ipld/go-codec-dagpb v1.6.0
 	github.com/ipld/go-ipld-prime v0.20.0
 	github.com/multiformats/go-multicodec v0.8.1
 	github.com/multiformats/go-multihash v0.2.1
+	github.com/multiformats/go-varint v0.0.7
 	github.com/spaolacci/murmur3 v1.1.0
 	github.com/stretchr/testify v1.8.2
 	google.golang.org/protobuf v1.28.1
@@ -51,7 +53,6 @@ require (
 	github.com/multiformats/go-base32 v0.1.0 // indirect
 	github.com/multiformats/go-base36 v0.2.0 // indirect
 	github.com/multiformats/go-multibase v0.1.1 // indirect
-	github.com/multiformats/go-varint v0.0.7 // indirect
 	github.com/opentracing/opentracing-go v1.2.0 // indirect
 	github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
diff --git a/go.sum b/go.sum
index e193de6..8390b46 100644
--- a/go.sum
+++ b/go.sum
@@ -108,6 +108,8 @@ github.com/ipfs/go-unixfs v0.4.4 h1:D/dLBOJgny5ZLIur2vIXVQVW0EyDHdOMBDEhgHrt6rY=
 github.com/ipfs/go-unixfs v0.4.4/go.mod h1:TSG7G1UuT+l4pNj91raXAPkX0BhJi3jST1FDTfQ5QyM=
 github.com/ipfs/go-verifcid v0.0.2 h1:XPnUv0XmdH+ZIhLGKg6U2vaPaRDXb9urMyNVCE7uvTs=
 github.com/ipfs/go-verifcid v0.0.2/go.mod h1:40cD9x1y4OWnFXbLNJYRe7MpNvWlMn3LZAG5Wb4xnPU=
+github.com/ipld/go-car v0.5.0 h1:kcCEa3CvYMs0iE5BzD5sV7O2EwMiCIp3uF8tA6APQT8=
+github.com/ipld/go-car v0.5.0/go.mod h1:ppiN5GWpjOZU9PgpAZ9HbZd9ZgSpwPMr48fGRJOWmvE=
 github.com/ipld/go-car/v2 v2.8.0 h1:8tUI+VM1mAQ2Qa7ScK++lfyuZYcGQ70bZ6NpGOcJj5o=
 github.com/ipld/go-car/v2 v2.8.0/go.mod h1:a+BnAxUqgr7wcWxW/lI6ctyEQ2v9gjBChPytwFMp2f4=
 github.com/ipld/go-codec-dagpb v1.6.0 h1:9nYazfyu9B1p3NAgfVdpRco3Fs2nFC72DqVsMj6rOcc=

From 29ca6b6b91cea96099cf645914bb781fafa70fee Mon Sep 17 00:00:00 2001
From: Will Scott <will.scott@protocol.ai>
Date: Sun, 6 Aug 2023 12:45:07 +0100
Subject: [PATCH 2/4] fixup

---
 data/builder/file_test.go | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/data/builder/file_test.go b/data/builder/file_test.go
index db4208c..dd5c0fc 100644
--- a/data/builder/file_test.go
+++ b/data/builder/file_test.go
@@ -3,19 +3,17 @@ package builder_test
 import (
 	"bytes"
 	"context"
+	"crypto/rand"
 	"fmt"
 	"io"
-	"math/rand"
 	"testing"
 
 	"github.com/ipfs/go-unixfsnode/data/builder"
 	"github.com/multiformats/go-multicodec"
-	multihash "github.com/multiformats/go-multihash/core"
 
 	"github.com/ipfs/go-cid"
 	u "github.com/ipfs/go-ipfs-util"
 	"github.com/ipfs/go-unixfsnode/file"
-	carv1 "github.com/ipld/go-car"
 	"github.com/ipld/go-car/v2"
 	dagpb "github.com/ipld/go-codec-dagpb"
 	"github.com/ipld/go-ipld-prime"
@@ -93,22 +91,6 @@ func TestEstimateUnixFSFileDefaultChunking(t *testing.T) {
 	}
 }
 
-func TestS(t *testing.T) {
-	p := cid.Prefix{
-		Version:  1,
-		Codec:    uint64(multicodec.DagPb),
-		MhType:   multihash.SHA2_256,
-		MhLength: 32,
-	}
-	rt, _ := p.Sum([]byte{0})
-	ch := carv1.CarHeader{
-		Roots:   []cid.Cid{rt},
-		Version: 1,
-	}
-	s, _ := carv1.HeaderSize(&ch)
-	t.Fatalf("hs: %d\n", s)
-}
-
 func TestUnixFSFileRoundtrip(t *testing.T) {
 	buf := make([]byte, 10*1024*1024)
 	u.NewSeededRand(0xdeadbeef).Read(buf)

From 7427951466499e516309f12cd5f0f8715f3e94f7 Mon Sep 17 00:00:00 2001
From: Will Scott <will.scott@protocol.ai>
Date: Sun, 6 Aug 2023 12:47:08 +0100
Subject: [PATCH 3/4] tidy

---
 go.mod | 1 -
 go.sum | 2 --
 2 files changed, 3 deletions(-)

diff --git a/go.mod b/go.mod
index 44f92a1..c901eef 100644
--- a/go.mod
+++ b/go.mod
@@ -10,7 +10,6 @@ require (
 	github.com/ipfs/go-ipld-format v0.4.0
 	github.com/ipfs/go-merkledag v0.10.0
 	github.com/ipfs/go-unixfs v0.4.4
-	github.com/ipld/go-car v0.5.0
 	github.com/ipld/go-car/v2 v2.8.0
 	github.com/ipld/go-codec-dagpb v1.6.0
 	github.com/ipld/go-ipld-prime v0.20.0
diff --git a/go.sum b/go.sum
index 8390b46..e193de6 100644
--- a/go.sum
+++ b/go.sum
@@ -108,8 +108,6 @@ github.com/ipfs/go-unixfs v0.4.4 h1:D/dLBOJgny5ZLIur2vIXVQVW0EyDHdOMBDEhgHrt6rY=
 github.com/ipfs/go-unixfs v0.4.4/go.mod h1:TSG7G1UuT+l4pNj91raXAPkX0BhJi3jST1FDTfQ5QyM=
 github.com/ipfs/go-verifcid v0.0.2 h1:XPnUv0XmdH+ZIhLGKg6U2vaPaRDXb9urMyNVCE7uvTs=
 github.com/ipfs/go-verifcid v0.0.2/go.mod h1:40cD9x1y4OWnFXbLNJYRe7MpNvWlMn3LZAG5Wb4xnPU=
-github.com/ipld/go-car v0.5.0 h1:kcCEa3CvYMs0iE5BzD5sV7O2EwMiCIp3uF8tA6APQT8=
-github.com/ipld/go-car v0.5.0/go.mod h1:ppiN5GWpjOZU9PgpAZ9HbZd9ZgSpwPMr48fGRJOWmvE=
 github.com/ipld/go-car/v2 v2.8.0 h1:8tUI+VM1mAQ2Qa7ScK++lfyuZYcGQ70bZ6NpGOcJj5o=
 github.com/ipld/go-car/v2 v2.8.0/go.mod h1:a+BnAxUqgr7wcWxW/lI6ctyEQ2v9gjBChPytwFMp2f4=
 github.com/ipld/go-codec-dagpb v1.6.0 h1:9nYazfyu9B1p3NAgfVdpRco3Fs2nFC72DqVsMj6rOcc=

From df0de45f57d113412ed44099aefe35360f9f54e7 Mon Sep 17 00:00:00 2001
From: Will <will.scott@protocol.ai>
Date: Mon, 7 Aug 2023 07:58:14 +0000
Subject: [PATCH 4/4] Update data/builder/file.go

Co-authored-by: Masih H. Derkani <m@derkani.org>
---
 data/builder/file.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/builder/file.go b/data/builder/file.go
index 323b147..c512afd 100644
--- a/data/builder/file.go
+++ b/data/builder/file.go
@@ -69,7 +69,7 @@ func EstimateUnixFSFileDefaultChunking(dataLength uint64) uint64 {
 	cidExample, _ := leafLinkProto.Prefix.Sum([]byte{0})
 	cidLength := uint64(len(cidExample.Bytes()))
 
-	links := []uint64{}
+	var links []uint64
 	for i := uint64(0); i < blocks; i++ {
 		links = append(links, uint64(chunk.DefaultBlockSize))
 	}