Skip to content

Commit

Permalink
Optimize merkledb hashing (#2886)
Browse files Browse the repository at this point in the history
  • Loading branch information
StephenButtolph authored Mar 29, 2024
1 parent d046f29 commit 2dbc9ba
Show file tree
Hide file tree
Showing 4 changed files with 198 additions and 29 deletions.
62 changes: 41 additions & 21 deletions x/merkledb/codec.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package merkledb

import (
"bytes"
"crypto/sha256"
"encoding/binary"
"errors"
"io"
Expand All @@ -28,11 +29,6 @@ const (
minByteSliceLen = minVarIntLen
minDBNodeLen = minMaybeByteSliceLen + minVarIntLen
minChildLen = minVarIntLen + minKeyLen + ids.IDLen + boolLen

estimatedKeyLen = 64
estimatedValueLen = 64
// Child index, child ID
hashValuesChildLen = minVarIntLen + ids.IDLen
)

var (
Expand All @@ -47,8 +43,8 @@ var (
errIntOverflow = errors.New("value overflows int")
)

Note that sha256.Write and bytes.Buffer.Write always return a nil error, so we
ignore their return values.

func childSize(index byte, childEntry *child) int {
// * index
Expand Down Expand Up @@ -107,30 +103,54 @@ func encodeDBNode(n *dbNode) []byte {
return buf.Bytes()
}

// Returns the bytes that will be hashed to generate [n]'s ID.
// Returns the canonical hash of [n].
//
// Assumes [n] is non-nil.
func encodeHashValues(n *node) []byte {
// This method is performance critical. It is not expected to perform any memory
// allocations.
func hashNode(n *node) ids.ID {
var (
numChildren = len(n.children)
// Estimate size [hv] to prevent memory allocations
estimatedLen = minVarIntLen + numChildren*hashValuesChildLen + estimatedValueLen + estimatedKeyLen
buf = bytes.NewBuffer(make([]byte, 0, estimatedLen))
sha = sha256.New()
hash ids.ID
// The hash length is larger than the maximum Uvarint length. This
// ensures binary.AppendUvarint doesn't perform any memory allocations.
emptyHashBuffer = hash[:0]
)

encodeUint(buf, uint64(numChildren))
// By directly calling sha.Write rather than passing sha around as an
// io.Writer, the compiler can perform sufficient escape analysis to avoid
// allocating buffers on the heap.
_, _ = sha.Write(binary.AppendUvarint(emptyHashBuffer, uint64(len(n.children))))

// ensure that the order of entries is consistent
keys := maps.Keys(n.children)
// By allocating BranchFactorLargest rather than len(n.children), this slice
// is allocated on the stack rather than the heap. BranchFactorLargest is
// at least len(n.children) which avoids memory allocations.
keys := make([]byte, 0, BranchFactorLargest)
for k := range n.children {
keys = append(keys, k)
}

// Ensure that the order of entries is correct.
slices.Sort(keys)
for _, index := range keys {
entry := n.children[index]
encodeUint(buf, uint64(index))
_, _ = buf.Write(entry.id[:])
_, _ = sha.Write(binary.AppendUvarint(emptyHashBuffer, uint64(index)))
_, _ = sha.Write(entry.id[:])
}
encodeMaybeByteSlice(buf, n.valueDigest)
encodeKeyToBuffer(buf, n.key)

return buf.Bytes()
if n.valueDigest.HasValue() {
_, _ = sha.Write(trueBytes)
value := n.valueDigest.Value()
_, _ = sha.Write(binary.AppendUvarint(emptyHashBuffer, uint64(len(value))))
_, _ = sha.Write(value)
} else {
_, _ = sha.Write(falseBytes)
}

_, _ = sha.Write(binary.AppendUvarint(emptyHashBuffer, uint64(n.key.length)))
_, _ = sha.Write(n.key.Bytes())
sha.Sum(emptyHashBuffer)
return hash
}

// Assumes [n] is non-nil.
Expand Down
160 changes: 154 additions & 6 deletions x/merkledb/codec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,8 @@ func TestCodecDecodeDBNode_TooShort(t *testing.T) {
require.ErrorIs(err, io.ErrUnexpectedEOF)
}

// Ensure that encodeHashValues is deterministic
func FuzzEncodeHashValues(f *testing.F) {
// Ensure that hashNode is deterministic
func FuzzHashNode(f *testing.F) {
f.Fuzz(
func(
t *testing.T,
Expand Down Expand Up @@ -222,17 +222,94 @@ func FuzzEncodeHashValues(f *testing.F) {
},
}

// Serialize hv multiple times
hvBytes1 := encodeHashValues(hv)
hvBytes2 := encodeHashValues(hv)
// Hash hv multiple times
hash1 := hashNode(hv)
hash2 := hashNode(hv)

// Make sure they're the same
require.Equal(hvBytes1, hvBytes2)
require.Equal(hash1, hash2)
}
},
)
}

// TestHashNode pins hashNode's output for several representative node shapes
// against golden hash strings, so any change to the hash preimage format is
// detected by a golden-value mismatch.
func TestHashNode(t *testing.T) {
	cases := []struct {
		name    string
		subject *node
		want    string
	}{
		{
			name:    "empty node",
			subject: newNode(Key{}),
			want:    "rbhtxoQ1DqWHvb6w66BZdVyjmPAneZUSwQq9uKj594qvFSdav",
		},
		{
			name: "has value",
			subject: func() *node {
				root := newNode(Key{})
				root.setValue(maybe.Some([]byte("value1")))
				return root
			}(),
			want: "2vx2xueNdWoH2uB4e8hbMU5jirtZkZ1c3ePCWDhXYaFRHpCbnQ",
		},
		{
			name:    "has key",
			subject: newNode(ToKey([]byte{0, 1, 2, 3, 4, 5, 6, 7})),
			want:    "2vA8ggXajhFEcgiF8zHTXgo8T2ALBFgffp1xfn48JEni1Uj5uK",
		},
		{
			name: "1 child",
			subject: func() *node {
				root := newNode(Key{})
				leaf := newNode(ToKey([]byte{255}))
				leaf.setValue(maybe.Some([]byte("value1")))
				root.addChildWithID(leaf, 4, hashNode(leaf))
				return root
			}(),
			want: "YfJRufqUKBv9ez6xZx6ogpnfDnw9fDsyebhYDaoaH57D3vRu3",
		},
		{
			name: "2 children",
			subject: func() *node {
				root := newNode(Key{})

				leafA := newNode(ToKey([]byte{255}))
				leafA.setValue(maybe.Some([]byte("value1")))

				leafB := newNode(ToKey([]byte{237}))
				leafB.setValue(maybe.Some([]byte("value2")))

				root.addChildWithID(leafA, 4, hashNode(leafA))
				root.addChildWithID(leafB, 4, hashNode(leafB))
				return root
			}(),
			want: "YVmbx5MZtSKuYhzvHnCqGrswQcxmozAkv7xE1vTA2EiGpWUkv",
		},
		{
			name: "16 children",
			subject: func() *node {
				root := newNode(Key{})
				for i := byte(0); i < 16; i++ {
					leaf := newNode(ToKey([]byte{i << 4}))
					leaf.setValue(maybe.Some([]byte("some value")))
					root.addChildWithID(leaf, 4, hashNode(leaf))
				}
				return root
			}(),
			want: "5YiFLL7QV3f441See9uWePi3wVKsx9fgvX5VPhU8PRxtLqhwY",
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := hashNode(tc.subject)
			require.Equal(t, tc.want, got.String())
		})
	}
}

func TestCodecDecodeKeyLengthOverflowRegression(t *testing.T) {
_, err := decodeKey(binary.AppendUvarint(nil, math.MaxInt))
require.ErrorIs(t, err, io.ErrUnexpectedEOF)
Expand All @@ -258,6 +335,77 @@ func TestUintSize(t *testing.T) {
}
}

// Benchmark_HashNode measures hashNode across the same node shapes exercised
// by TestHashNode; run with -benchmem to verify the zero-allocation goal.
func Benchmark_HashNode(b *testing.B) {
	cases := []struct {
		name    string
		subject *node
	}{
		{
			name:    "empty node",
			subject: newNode(Key{}),
		},
		{
			name: "has value",
			subject: func() *node {
				root := newNode(Key{})
				root.setValue(maybe.Some([]byte("value1")))
				return root
			}(),
		},
		{
			name:    "has key",
			subject: newNode(ToKey([]byte{0, 1, 2, 3, 4, 5, 6, 7})),
		},
		{
			name: "1 child",
			subject: func() *node {
				root := newNode(Key{})
				leaf := newNode(ToKey([]byte{255}))
				leaf.setValue(maybe.Some([]byte("value1")))
				root.addChildWithID(leaf, 4, hashNode(leaf))
				return root
			}(),
		},
		{
			name: "2 children",
			subject: func() *node {
				root := newNode(Key{})

				leafA := newNode(ToKey([]byte{255}))
				leafA.setValue(maybe.Some([]byte("value1")))

				leafB := newNode(ToKey([]byte{237}))
				leafB.setValue(maybe.Some([]byte("value2")))

				root.addChildWithID(leafA, 4, hashNode(leafA))
				root.addChildWithID(leafB, 4, hashNode(leafB))
				return root
			}(),
		},
		{
			name: "16 children",
			subject: func() *node {
				root := newNode(Key{})
				for i := byte(0); i < 16; i++ {
					leaf := newNode(ToKey([]byte{i << 4}))
					leaf.setValue(maybe.Some([]byte("some value")))
					root.addChildWithID(leaf, 4, hashNode(leaf))
				}
				return root
			}(),
		},
	}
	for _, bm := range cases {
		b.Run(bm.name, func(b *testing.B) {
			for i := 0; i < b.N; i++ {
				hashNode(bm.subject)
			}
		})
	}
}

func Benchmark_EncodeUint(b *testing.B) {
var dst bytes.Buffer
dst.Grow(binary.MaxVarintLen64)
Expand Down
2 changes: 2 additions & 0 deletions x/merkledb/key.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ const (
BranchFactor4 = BranchFactor(4)
BranchFactor16 = BranchFactor(16)
BranchFactor256 = BranchFactor(256)

BranchFactorLargest = BranchFactor256
)

// Valid checks if BranchFactor [b] is one of the predefined valid options for BranchFactor
Expand Down
3 changes: 1 addition & 2 deletions x/merkledb/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ func (n *node) bytes() []byte {
// Returns the ID (hash) of this node and records the hash computation in
// [metrics].
//
// NOTE(review): the pre-existing comment said this "caches" the ID, but the
// visible body does not cache — hashNode recomputes on every call; any caching
// must happen in the caller. Confirm against callers before relying on it.
func (n *node) calculateID(metrics merkleMetrics) ids.ID {
	metrics.HashCalculated()
	return hashNode(n)
}

// Set [n]'s value to [val].
Expand Down

0 comments on commit 2dbc9ba

Please sign in to comment.