Skip to content
This repository has been archived by the owner on Jun 27, 2023. It is now read-only.

Fix HAMT enumeration for in-memory traversal #110

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 66 additions & 33 deletions hamt/hamt.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ func (ds *Shard) Node() (ipld.Node, error) {
} else {
// child unloaded, just copy in link with updated name
lnk := ds.childer.link(sliceIndex)
label := lnk.Name[ds.maxpadlen:]
label := ds.extractLinkName(lnk)

err := out.AddRawLink(ds.linkNamePrefix(childIndex)+label, lnk)
if err != nil {
Expand All @@ -207,14 +207,18 @@ func (ds *Shard) Node() (ipld.Node, error) {
return out, nil
}

func (ds *Shard) extractLinkName(lnk *ipld.Link) string {
return lnk.Name[ds.maxpadlen:]
}

func (ds *Shard) makeShardValue(lnk *ipld.Link) (*Shard, error) {
lnk2 := *lnk
s, err := makeShard(ds.dserv, ds.tableSize)
if err != nil {
return nil, err
}

s.key = lnk.Name[ds.maxpadlen:]
s.key = ds.extractLinkName(lnk)
s.val = &lnk2

return s, nil
Expand Down Expand Up @@ -372,57 +376,85 @@ func (ds *Shard) EnumLinksAsync(ctx context.Context) <-chan format.LinkResult {
go func() {
defer close(linkResults)
defer cancel()
getLinks := makeAsyncTrieGetLinks(ds.dserv, linkResults)
getLinks := getLinksHAMTNode(ds, linkResults)
cset := cid.NewSet()
rootNode, err := ds.Node()
if err != nil {
emitResult(ctx, linkResults, format.LinkResult{Link: nil, Err: err})
return
}
err = dag.Walk(ctx, getLinks, rootNode.Cid(), cset.Visit, dag.Concurrent())
err := dag.Walk(ctx, getLinks, packBytesIntoCID(nil), cset.Visit, dag.Concurrent())
Comment on lines -377 to +381
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note removal of serialization here. Will add an explicit test.

if err != nil {
emitResult(ctx, linkResults, format.LinkResult{Link: nil, Err: err})
}
}()
return linkResults
}

// makeAsyncTrieGetLinks builds a getLinks function that can be used with EnumerateChildrenAsync
// to iterate a HAMT shard. It takes an IPLD Dag Service to fetch nodes, and a call back that will get called
// on all links to leaf nodes in a HAMT tree, so they can be collected for an EnumLinks operation
func makeAsyncTrieGetLinks(dagService ipld.DAGService, linkResults chan<- format.LinkResult) dag.GetLinks {
// Used only as the `GetLinks` callback for `dag.Walk` to enumerate links in
// the HAMT structure (`EnumLinksAsync`). It is bind to the root `Shard`
// node being enumerated (using its DAG service and traversing always
// from that node).
// FIXME: To traverse the HAMT in memory (not only its links but its
// child Shard nodes loaded in memory, `node.childer.children`) we are
// **abusing the CID** passing arbitrary bytes through it which enocde
// the position of a specific node in the HAMT which we will scan for its
// children.
func getLinksHAMTNode(root *Shard, linkResults chan<- format.LinkResult) dag.GetLinks {
// Traverse the HAMT position encoded in the CID to find the `node`
// we are fetching.
goToNodeFromCID := func(ctx context.Context, pos *hamtPos) (*Shard, error) {
node := root
for depth := range pos.childSlicePos {
sliceIndex := int(pos.childSlicePos[depth])
var err error
node, err = node.childer.get(ctx, sliceIndex)
if err != nil {
return nil, err
}
}
return node, nil
}

emitValueLink := func(ctx context.Context, name string, link *ipld.Link) {
formattedLink := *link
formattedLink.Name = name
emitResult(ctx, linkResults, format.LinkResult{Link: &formattedLink, Err: nil})
}

packShardPosAsLinkCID := func(currentPos *hamtPos, newSliceIndex int) *ipld.Link {
return &ipld.Link{Cid: packBytesIntoCID(posToBytes(currentPos.addChildPos(newSliceIndex)))}
}

return func(ctx context.Context, currentCid cid.Cid) ([]*ipld.Link, error) {
node, err := dagService.Get(ctx, currentCid)
if err != nil {
return nil, err
}
directoryShard, err := NewHamtFromDag(dagService, node)
pos := posFromBytes(unpackBytesFromCID(currentCid))
node, err := goToNodeFromCID(ctx, pos)
if err != nil {
return nil, err
}

childShards := make([]*ipld.Link, 0, directoryShard.childer.length())
links := directoryShard.childer.links
for idx := range directoryShard.childer.children {
lnk := links[idx]
lnkLinkType, err := directoryShard.childLinkType(lnk)
childShards := make([]*ipld.Link, 0, node.childer.length())
for sliceIndex := range node.childer.children {
newPos := packShardPosAsLinkCID(pos, sliceIndex)

if err != nil {
return nil, err
}
if lnkLinkType == shardLink {
childShards = append(childShards, lnk)
if inMemoryChild := node.childer.children[sliceIndex]; inMemoryChild != nil {
// The child is already loaded in memory.
if inMemoryChild.isValueNode() {
emitValueLink(ctx, inMemoryChild.key, inMemoryChild.val)
} else {
childShards = append(childShards, newPos)
}
} else {
sv, err := directoryShard.makeShardValue(lnk)
// The child is not loaded in memory but is instead stored
// in the DAG service and referenced here though its link.
link := node.childer.links[sliceIndex]
lnkLinkType, err := node.childLinkType(link)
if err != nil {
return nil, err
}
formattedLink := sv.val
formattedLink.Name = sv.key
emitResult(ctx, linkResults, format.LinkResult{Link: formattedLink, Err: nil})
if lnkLinkType == shardValueLink {
emitValueLink(ctx, node.extractLinkName(link), link)
} else {
childShards = append(childShards, newPos)
}
}
}

return childShards, nil
}
}
Expand Down Expand Up @@ -596,7 +628,8 @@ type childer struct {
dserv ipld.DAGService
bitfield bitfield.Bitfield

// Only one of links/children will be non-nil for every child/link.
// Only one of links/children will be non-nil for every child/link,
// but they have always the same length.
links []*ipld.Link
children []*Shard
}
Expand Down
47 changes: 47 additions & 0 deletions hamt/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import (

"github.com/ipfs/go-unixfs/internal"

"github.com/ipfs/go-cid"
"github.com/multiformats/go-multihash"
"github.com/spaolacci/murmur3"
)

Expand Down Expand Up @@ -78,3 +80,48 @@ func murmur3Hash(val []byte) []byte {
h.Write(val)
return h.Sum(nil)
}

// We convert CIDs to arbitrary strings as CIDv1 raw identity hashes.
func packBytesIntoCID(data []byte) cid.Cid {
h, err := multihash.Sum(data, multihash.IDENTITY, -1)
if err != nil {
panic(fmt.Sprintf("error while packing data %s: %s",
data, err))
}
return cid.NewCidV1(cid.Raw, h)
}

func unpackBytesFromCID(c cid.Cid) []byte {
if c.Version() != 1 {
panic("trying to unpack CIDv0")
}
decoded, err := multihash.Decode(c.Hash())
if err != nil {
panic(fmt.Sprintf("error while unpacking data from CID %s: %s",
c, err))
}
return decoded.Digest
}

// Coordinate in a HAMT to single out a single node.
// For simplicity this is independent of the hash the node encodes and just
// the position in the nodes.
// See `getLinksHAMTNode` for details.
type hamtPos struct {
// Slice of child positions (from the immediate parent perspective).
// FIXME: We are assuming a shard width <= 256 (the default) to easily
// convert to and from a byte slice.
Comment on lines +112 to +113
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Working on this (extending the hashBits structure to see if it naturally fits into the HAMT API).

childSlicePos []byte
}

func (hp *hamtPos) addChildPos(childIndex int) *hamtPos {
return &hamtPos{ append(hp.childSlicePos, byte(childIndex))}
}

func posFromBytes(data []byte) *hamtPos{
return &hamtPos{data}
}

func posToBytes(hp *hamtPos) []byte {
return hp.childSlicePos
}