From 2dbc9ba9f37b3c9581739efa2752f61ca32ba80e Mon Sep 17 00:00:00 2001 From: Stephen Buttolph Date: Fri, 29 Mar 2024 16:20:53 -0400 Subject: [PATCH] Optimize merkledb hashing (#2886) --- x/merkledb/codec.go | 62 ++++++++++----- x/merkledb/codec_test.go | 160 +++++++++++++++++++++++++++++++++++++-- x/merkledb/key.go | 2 + x/merkledb/node.go | 3 +- 4 files changed, 198 insertions(+), 29 deletions(-) diff --git a/x/merkledb/codec.go b/x/merkledb/codec.go index d98646a1f267..156a6630899c 100644 --- a/x/merkledb/codec.go +++ b/x/merkledb/codec.go @@ -5,6 +5,7 @@ package merkledb import ( "bytes" + "crypto/sha256" "encoding/binary" "errors" "io" @@ -28,11 +29,6 @@ const ( minByteSliceLen = minVarIntLen minDBNodeLen = minMaybeByteSliceLen + minVarIntLen minChildLen = minVarIntLen + minKeyLen + ids.IDLen + boolLen - - estimatedKeyLen = 64 - estimatedValueLen = 64 - // Child index, child ID - hashValuesChildLen = minVarIntLen + ids.IDLen ) var ( @@ -47,8 +43,8 @@ var ( errIntOverflow = errors.New("value overflows int") ) -// Note that bytes.Buffer.Write always returns nil, so we ignore its return -// values in all encode methods. +// Note that sha256.Write and bytes.Buffer.Write always returns nil, so we +// ignore their return values. func childSize(index byte, childEntry *child) int { // * index @@ -107,30 +103,54 @@ func encodeDBNode(n *dbNode) []byte { return buf.Bytes() } -// Returns the bytes that will be hashed to generate [n]'s ID. +// Returns the canonical hash of [n]. +// // Assumes [n] is non-nil. -func encodeHashValues(n *node) []byte { +// This method is performance critical. It is not expected to perform any memory +// allocations. +func hashNode(n *node) ids.ID { var ( - numChildren = len(n.children) - // Estimate size [hv] to prevent memory allocations - estimatedLen = minVarIntLen + numChildren*hashValuesChildLen + estimatedValueLen + estimatedKeyLen - buf = bytes.NewBuffer(make([]byte, 0, estimatedLen)) + sha = sha256.New() + hash ids.ID + // The hash length is larger than the maximum Uvarint length. This + // ensures binary.AppendUvarint doesn't perform any memory allocations. + emptyHashBuffer = hash[:0] ) - encodeUint(buf, uint64(numChildren)) + // By directly calling sha.Write rather than passing sha around as an + // io.Writer, the compiler can perform sufficient escape analysis to avoid + // allocating buffers on the heap. + _, _ = sha.Write(binary.AppendUvarint(emptyHashBuffer, uint64(len(n.children)))) - // ensure that the order of entries is consistent - keys := maps.Keys(n.children) + // By allocating BranchFactorLargest rather than len(n.children), this slice + // is allocated on the stack rather than the heap. BranchFactorLargest is + // at least len(n.children) which avoids memory allocations. + keys := make([]byte, 0, BranchFactorLargest) + for k := range n.children { + keys = append(keys, k) + } + + // Ensure that the order of entries is correct. slices.Sort(keys) for _, index := range keys { entry := n.children[index] - encodeUint(buf, uint64(index)) - _, _ = buf.Write(entry.id[:]) + _, _ = sha.Write(binary.AppendUvarint(emptyHashBuffer, uint64(index))) + _, _ = sha.Write(entry.id[:]) } - encodeMaybeByteSlice(buf, n.valueDigest) - encodeKeyToBuffer(buf, n.key) - return buf.Bytes() + if n.valueDigest.HasValue() { + _, _ = sha.Write(trueBytes) + value := n.valueDigest.Value() + _, _ = sha.Write(binary.AppendUvarint(emptyHashBuffer, uint64(len(value)))) + _, _ = sha.Write(value) + } else { + _, _ = sha.Write(falseBytes) + } + + _, _ = sha.Write(binary.AppendUvarint(emptyHashBuffer, uint64(n.key.length))) + _, _ = sha.Write(n.key.Bytes()) + sha.Sum(emptyHashBuffer) + return hash } // Assumes [n] is non-nil. diff --git a/x/merkledb/codec_test.go b/x/merkledb/codec_test.go index 685c6453471a..171909d51c58 100644 --- a/x/merkledb/codec_test.go +++ b/x/merkledb/codec_test.go @@ -178,8 +178,8 @@ func TestCodecDecodeDBNode_TooShort(t *testing.T) { require.ErrorIs(err, io.ErrUnexpectedEOF) } -// Ensure that encodeHashValues is deterministic -func FuzzEncodeHashValues(f *testing.F) { +// Ensure that hashNode is deterministic +func FuzzHashNode(f *testing.F) { f.Fuzz( func( t *testing.T, @@ -222,17 +222,94 @@ func FuzzEncodeHashValues(f *testing.F) { }, } - // Serialize hv multiple times - hvBytes1 := encodeHashValues(hv) - hvBytes2 := encodeHashValues(hv) + // Hash hv multiple times + hash1 := hashNode(hv) + hash2 := hashNode(hv) // Make sure they're the same - require.Equal(hvBytes1, hvBytes2) + require.Equal(hash1, hash2) } }, ) } +func TestHashNode(t *testing.T) { + tests := []struct { + name string + n *node + expectedHash string + }{ + { + name: "empty node", + n: newNode(Key{}), + expectedHash: "rbhtxoQ1DqWHvb6w66BZdVyjmPAneZUSwQq9uKj594qvFSdav", + }, + { + name: "has value", + n: func() *node { + n := newNode(Key{}) + n.setValue(maybe.Some([]byte("value1"))) + return n + }(), + expectedHash: "2vx2xueNdWoH2uB4e8hbMU5jirtZkZ1c3ePCWDhXYaFRHpCbnQ", + }, + { + name: "has key", + n: newNode(ToKey([]byte{0, 1, 2, 3, 4, 5, 6, 7})), + expectedHash: "2vA8ggXajhFEcgiF8zHTXgo8T2ALBFgffp1xfn48JEni1Uj5uK", + }, + { + name: "1 child", + n: func() *node { + n := newNode(Key{}) + childNode := newNode(ToKey([]byte{255})) + childNode.setValue(maybe.Some([]byte("value1"))) + n.addChildWithID(childNode, 4, hashNode(childNode)) + return n + }(), + expectedHash: "YfJRufqUKBv9ez6xZx6ogpnfDnw9fDsyebhYDaoaH57D3vRu3", + }, + { + name: "2 children", + n: func() *node { + n := newNode(Key{}) + + childNode1 := newNode(ToKey([]byte{255})) + childNode1.setValue(maybe.Some([]byte("value1"))) + + childNode2 := newNode(ToKey([]byte{237})) + childNode2.setValue(maybe.Some([]byte("value2"))) + + n.addChildWithID(childNode1, 4, hashNode(childNode1)) + n.addChildWithID(childNode2, 4, hashNode(childNode2)) + return n + }(), + expectedHash: "YVmbx5MZtSKuYhzvHnCqGrswQcxmozAkv7xE1vTA2EiGpWUkv", + }, + { + name: "16 children", + n: func() *node { + n := newNode(Key{}) + + for i := byte(0); i < 16; i++ { + childNode := newNode(ToKey([]byte{i << 4})) + childNode.setValue(maybe.Some([]byte("some value"))) + + n.addChildWithID(childNode, 4, hashNode(childNode)) + } + return n + }(), + expectedHash: "5YiFLL7QV3f441See9uWePi3wVKsx9fgvX5VPhU8PRxtLqhwY", + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + hash := hashNode(test.n) + require.Equal(t, test.expectedHash, hash.String()) + }) + } +} + func TestCodecDecodeKeyLengthOverflowRegression(t *testing.T) { _, err := decodeKey(binary.AppendUvarint(nil, math.MaxInt)) require.ErrorIs(t, err, io.ErrUnexpectedEOF) @@ -258,6 +335,77 @@ func TestUintSize(t *testing.T) { } } +func Benchmark_HashNode(b *testing.B) { + benchmarks := []struct { + name string + n *node + }{ + { + name: "empty node", + n: newNode(Key{}), + }, + { + name: "has value", + n: func() *node { + n := newNode(Key{}) + n.setValue(maybe.Some([]byte("value1"))) + return n + }(), + }, + { + name: "has key", + n: newNode(ToKey([]byte{0, 1, 2, 3, 4, 5, 6, 7})), + }, + { + name: "1 child", + n: func() *node { + n := newNode(Key{}) + childNode := newNode(ToKey([]byte{255})) + childNode.setValue(maybe.Some([]byte("value1"))) + n.addChildWithID(childNode, 4, hashNode(childNode)) + return n + }(), + }, + { + name: "2 children", + n: func() *node { + n := newNode(Key{}) + + childNode1 := newNode(ToKey([]byte{255})) + childNode1.setValue(maybe.Some([]byte("value1"))) + + childNode2 := newNode(ToKey([]byte{237})) + childNode2.setValue(maybe.Some([]byte("value2"))) + + n.addChildWithID(childNode1, 4, hashNode(childNode1)) + n.addChildWithID(childNode2, 4, hashNode(childNode2)) + return n + }(), + }, + { + name: "16 children", + n: func() *node { + n := newNode(Key{}) + + for i := byte(0); i < 16; i++ { + childNode := newNode(ToKey([]byte{i << 4})) + childNode.setValue(maybe.Some([]byte("some value"))) + + n.addChildWithID(childNode, 4, hashNode(childNode)) + } + return n + }(), + }, + } + for _, benchmark := range benchmarks { + b.Run(benchmark.name, func(b *testing.B) { + for i := 0; i < b.N; i++ { + hashNode(benchmark.n) + } + }) + } +} + func Benchmark_EncodeUint(b *testing.B) { var dst bytes.Buffer dst.Grow(binary.MaxVarintLen64) diff --git a/x/merkledb/key.go b/x/merkledb/key.go index 524c95bb2dae..9febe3313875 100644 --- a/x/merkledb/key.go +++ b/x/merkledb/key.go @@ -48,6 +48,8 @@ const ( BranchFactor4 = BranchFactor(4) BranchFactor16 = BranchFactor(16) BranchFactor256 = BranchFactor(256) + + BranchFactorLargest = BranchFactor256 ) // Valid checks if BranchFactor [b] is one of the predefined valid options for BranchFactor diff --git a/x/merkledb/node.go b/x/merkledb/node.go index 60b5cecbb3a6..c6498322f0c4 100644 --- a/x/merkledb/node.go +++ b/x/merkledb/node.go @@ -70,8 +70,7 @@ func (n *node) bytes() []byte { // Returns and caches the ID of this node. func (n *node) calculateID(metrics merkleMetrics) ids.ID { metrics.HashCalculated() - bytes := encodeHashValues(n) - return hashing.ComputeHash256Array(bytes) + return hashNode(n) } // Set [n]'s value to [val].