From d3cc61895109fcb267c0b6da81606c0498c82751 Mon Sep 17 00:00:00 2001
From: Martin HS <martin@swende.se>
Date: Thu, 23 Jan 2025 10:17:12 +0100
Subject: [PATCH] trie: reduce allocations in stacktrie (#30743)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR uses various tweaks and tricks to make the stacktrie near
alloc-free.

```
[user@work go-ethereum]$ benchstat stacktrie.1 stacktrie.7
goos: linux
goarch: amd64
pkg: github.com/ethereum/go-ethereum/trie
cpu: 12th Gen Intel(R) Core(TM) i7-1270P
             │ stacktrie.1  │             stacktrie.7              │
             │    sec/op    │    sec/op     vs base                │
Insert100K-8   106.97m ± 8%   88.21m ± 34%  -17.54% (p=0.000 n=10)

             │   stacktrie.1    │             stacktrie.7              │
             │       B/op       │     B/op      vs base                │
Insert100K-8   13199.608Ki ± 0%   3.424Ki ± 3%  -99.97% (p=0.000 n=10)

             │  stacktrie.1   │             stacktrie.7             │
             │   allocs/op    │ allocs/op   vs base                 │
Insert100K-8   553428.50 ± 0%   22.00 ± 5%  -100.00% (p=0.000 n=10)
```
Also improves derivesha:
```
goos: linux
goarch: amd64
pkg: github.com/ethereum/go-ethereum/core/types
cpu: 12th Gen Intel(R) Core(TM) i7-1270P
                          │ derivesha.1 │             derivesha.2              │
                          │   sec/op    │    sec/op     vs base                │
DeriveSha200/stack_trie-8   477.8µ ± 2%   430.0µ ± 12%  -10.00% (p=0.000 n=10)

                          │ derivesha.1  │             derivesha.2              │
                          │     B/op     │     B/op      vs base                │
DeriveSha200/stack_trie-8   45.17Ki ± 0%   25.65Ki ± 0%  -43.21% (p=0.000 n=10)

                          │ derivesha.1 │            derivesha.2             │
                          │  allocs/op  │ allocs/op   vs base                │
DeriveSha200/stack_trie-8   1259.0 ± 0%   232.0 ± 0%  -81.57% (p=0.000 n=10)

```

---------

Co-authored-by: Gary Rong <garyrong0905@gmail.com>
---
 trie/bytepool.go       | 64 ++++++++++++++++++++++++++++++++++++
 trie/encoding.go       | 12 +++++++
 trie/hasher.go         |  8 +++++
 trie/node.go           | 35 ++++++++++++--------
 trie/node_enc.go       | 39 +++++++++++++++++++---
 trie/stacktrie.go      | 73 +++++++++++++++++++++++++++++-------------
 trie/stacktrie_test.go | 46 ++++++++++++++++++++++++++
 7 files changed, 238 insertions(+), 39 deletions(-)
 create mode 100644 trie/bytepool.go

diff --git a/trie/bytepool.go b/trie/bytepool.go
new file mode 100644
index 000000000000..4f9c5672fd9b
--- /dev/null
+++ b/trie/bytepool.go
@@ -0,0 +1,64 @@
+// Copyright 2024 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+package trie
+
+// bytesPool is a pool for byte slices. It is safe for concurrent use.
+type bytesPool struct {
+	c chan []byte // buffered channel holding the idle slices
+	w int         // capacity given to newly allocated slices
+}
+
+// newBytesPool returns an empty bytesPool. Slices allocated by the pool get
+// a capacity of sliceCap, and at most nitems idle slices are kept around for
+// later reuse.
+func newBytesPool(sliceCap, nitems int) *bytesPool {
+	pool := new(bytesPool)
+	pool.w = sliceCap
+	pool.c = make(chan []byte, nitems)
+	return pool
+}
+
+// Get returns an empty (zero-length) slice with spare capacity. Safe for concurrent use.
+func (bp *bytesPool) Get() []byte {
+	select {
+	case b := <-bp.c:
+		return b[:0] // drop the stale length so pooled and fresh slices look alike
+	default:
+		return make([]byte, 0, bp.w)
+	}
+}
+
+// GetWithSize returns a slice of length s, which may hold stale data. A new
+// slice is allocated when the one obtained from Get is too small.
+func (bp *bytesPool) GetWithSize(s int) []byte {
+	if buf := bp.Get(); cap(buf) >= s {
+		return buf[:s]
+	}
+	return make([]byte, s)
+}
+
+// Put hands a slice back to the pool. Safe for concurrent use. Slices with a
+// capacity below bp.w or above 3*bp.w are dropped, as is b if the pool is full.
+func (bp *bytesPool) Put(b []byte) {
+	size := cap(b)
+	if size >= bp.w && size <= 3*bp.w {
+		select {
+		case bp.c <- b:
+		default: // pool is full, leave the slice to the GC
+		}
+	}
+}
diff --git a/trie/encoding.go b/trie/encoding.go
index 3284d3f8f021..4cd29f531afd 100644
--- a/trie/encoding.go
+++ b/trie/encoding.go
@@ -104,6 +104,18 @@ func keybytesToHex(str []byte) []byte {
 	return nibbles
 }
 
+// writeHexKey writes the key as hex nibbles into dst and returns dst[:2*len(key)].
+// OBS! This method omits the termination flag.
+// OBS! The dst slice must be at least 2x as large as the key.
+func writeHexKey(dst []byte, key []byte) []byte {
+	dst = dst[:2*len(key)] // single bounds check; unlike dst[2*len(key)-1], ok for empty keys
+	for i, b := range key {
+		dst[i*2] = b / 16
+		dst[i*2+1] = b % 16
+	}
+	return dst
+}
+
 // hexToKeybytes turns hex nibbles into key bytes.
 // This can only be used for keys of even length.
 func hexToKeybytes(hex []byte) []byte {
diff --git a/trie/hasher.go b/trie/hasher.go
index abf654c709cf..28f7f3d0c387 100644
--- a/trie/hasher.go
+++ b/trie/hasher.go
@@ -188,6 +188,14 @@ func (h *hasher) hashData(data []byte) hashNode {
 	return n
 }
 
+// hashDataTo hashes the provided data and stores the result in dst. The caller
+// must ensure that dst is of appropriate size; the stacktrie passes 32 bytes.
+func (h *hasher) hashDataTo(dst, data []byte) {
+	h.sha.Reset()
+	h.sha.Write(data)
+	h.sha.Read(dst) // presumably reads len(dst) digest bytes into dst; TODO confirm
+}
+
 // proofHash is used to construct trie proofs, and returns the 'collapsed'
 // node (for later RLP encoding) as well as the hashed node -- unless the
 // node is smaller than 32 bytes, in which case it will be returned as is.
diff --git a/trie/node.go b/trie/node.go
index 15bbf62f1c93..ecc2de192d3b 100644
--- a/trie/node.go
+++ b/trie/node.go
@@ -45,6 +45,27 @@ type (
 	}
 	hashNode  []byte
 	valueNode []byte
+
+	// fullnodeEncoder is a type used exclusively for encoding fullNode.
+	// Briefly instantiating a fullnodeEncoder and initializing with
+	// existing slices is less memory intensive than using the fullNode type.
+	fullnodeEncoder struct {
+		Children [17][]byte
+	}
+
+	// extNodeEncoder is a type used exclusively for encoding extension nodes.
+	// Briefly instantiating an extNodeEncoder and initializing with existing
+	// slices is less memory intensive than using the shortNode type.
+	extNodeEncoder struct {
+		Key []byte
+		Val []byte
+	}
+
+	// leafNodeEncoder is a type used exclusively for encoding leaf nodes.
+	leafNodeEncoder struct {
+		Key []byte
+		Val []byte
+	}
 )
 
 // nilValueNode is used when collapsing internal trie nodes for hashing, since
@@ -89,6 +110,7 @@ func (n *fullNode) fstring(ind string) string {
 	}
 	return resp + fmt.Sprintf("\n%s] ", ind)
 }
+
 func (n *shortNode) fstring(ind string) string {
 	return fmt.Sprintf("{%x: %v} ", n.Key, n.Val.fstring(ind+"  "))
 }
@@ -99,19 +121,6 @@ func (n valueNode) fstring(ind string) string {
 	return fmt.Sprintf("%x ", []byte(n))
 }
 
-// rawNode is a simple binary blob used to differentiate between collapsed trie
-// nodes and already encoded RLP binary blobs (while at the same time store them
-// in the same cache fields).
-type rawNode []byte
-
-func (n rawNode) cache() (hashNode, bool)   { panic("this should never end up in a live trie") }
-func (n rawNode) fstring(ind string) string { panic("this should never end up in a live trie") }
-
-func (n rawNode) EncodeRLP(w io.Writer) error {
-	_, err := w.Write(n)
-	return err
-}
-
 // mustDecodeNode is a wrapper of decodeNode and panic if any error is encountered.
 func mustDecodeNode(hash, buf []byte) node {
 	n, err := decodeNode(hash, buf)
diff --git a/trie/node_enc.go b/trie/node_enc.go
index 1b2eca682f0b..c95587eeabb7 100644
--- a/trie/node_enc.go
+++ b/trie/node_enc.go
@@ -40,6 +40,20 @@ func (n *fullNode) encode(w rlp.EncoderBuffer) {
 	w.ListEnd(offset)
 }
 
+func (n *fullnodeEncoder) encode(w rlp.EncoderBuffer) {
+	list := w.List()
+	for _, child := range n.Children {
+		if len(child) >= 32 {
+			w.WriteBytes(child) // hashNode
+		} else if child != nil {
+			w.Write(child) // rawNode, written as-is
+		} else {
+			w.Write(rlp.EmptyString) // unused slot
+		}
+	}
+	w.ListEnd(list)
+}
+
 func (n *shortNode) encode(w rlp.EncoderBuffer) {
 	offset := w.List()
 	w.WriteBytes(n.Key)
@@ -51,6 +65,27 @@ func (n *shortNode) encode(w rlp.EncoderBuffer) {
 	w.ListEnd(offset)
 }
 
+// encode writes the node as an RLP list of the key followed by the child.
+func (n *extNodeEncoder) encode(w rlp.EncoderBuffer) {
+	list := w.List()
+	w.WriteBytes(n.Key)
+	if child := n.Val; len(child) >= 32 {
+		w.WriteBytes(child) // hashNode
+	} else if child != nil {
+		w.Write(child) // rawNode
+	} else {
+		w.Write(rlp.EmptyString)
+	}
+	w.ListEnd(list)
+}
+
+func (n *leafNodeEncoder) encode(w rlp.EncoderBuffer) {
+	list := w.List()
+	w.WriteBytes(n.Key) // key in compact format
+	w.WriteBytes(n.Val) // leaf value, must be non-nil
+	w.ListEnd(list)
+}
+
 func (n hashNode) encode(w rlp.EncoderBuffer) {
 	w.WriteBytes(n)
 }
@@ -58,7 +93,3 @@ func (n hashNode) encode(w rlp.EncoderBuffer) {
 func (n valueNode) encode(w rlp.EncoderBuffer) {
 	w.WriteBytes(n)
 }
-
-func (n rawNode) encode(w rlp.EncoderBuffer) {
-	w.Write(n)
-}
diff --git a/trie/stacktrie.go b/trie/stacktrie.go
index d194cbf0aec4..2b7366c3c514 100644
--- a/trie/stacktrie.go
+++ b/trie/stacktrie.go
@@ -27,6 +27,7 @@ import (
 
 var (
 	stPool = sync.Pool{New: func() any { return new(stNode) }}
+	bPool  = newBytesPool(32, 100)
 	_      = types.TrieHasher((*StackTrie)(nil))
 )
 
@@ -47,6 +48,8 @@ type StackTrie struct {
 	h          *hasher
 	last       []byte
 	onTrieNode OnTrieNode
+	kBuf       []byte // buf space used for hex-key during insertions
+	pBuf       []byte // buf space used for path during insertions
 }
 
 // NewStackTrie allocates and initializes an empty trie. The committed nodes
@@ -56,6 +59,17 @@ func NewStackTrie(onTrieNode OnTrieNode) *StackTrie {
 		root:       stPool.Get().(*stNode),
 		h:          newHasher(false),
 		onTrieNode: onTrieNode,
+		kBuf:       make([]byte, 64),
+		pBuf:       make([]byte, 64),
+	}
+}
+
+func (t *StackTrie) grow(key []byte) {
+	if cap(t.kBuf) < 2*len(key) { // kBuf receives two nibbles per key byte (see writeHexKey)
+		t.kBuf = make([]byte, 2*len(key))
+	}
+	if cap(t.pBuf) < 2*len(key) { // pBuf backs the path slice that Update hands to insert
+		t.pBuf = make([]byte, 2*len(key))
 	}
 }
 
@@ -64,7 +78,8 @@ func (t *StackTrie) Update(key, value []byte) error {
 	if len(value) == 0 {
 		return errors.New("trying to insert empty (deletion)")
 	}
-	k := t.TrieKey(key)
+	t.grow(key)
+	k := writeHexKey(t.kBuf, key)
 	if bytes.Compare(t.last, k) >= 0 {
 		return errors.New("non-ascending key order")
 	}
@@ -73,7 +88,7 @@ func (t *StackTrie) Update(key, value []byte) error {
 	} else {
 		t.last = append(t.last[:0], k...) // reuse key slice
 	}
-	t.insert(t.root, k, value, nil)
+	t.insert(t.root, k, value, t.pBuf[:0])
 	return nil
 }
 
@@ -129,6 +144,12 @@ const (
 )
 
 func (n *stNode) reset() *stNode {
+	if n.typ == hashedNode {
+		// On hashnodes, we 'own' the val: it is guaranteed to be not held
+		// by external caller. Hence, when we arrive here, we can put it back
+		// into the pool
+		bPool.Put(n.val)
+	}
 	n.key = n.key[:0]
 	n.val = nil
 	for i := range n.children {
@@ -150,8 +171,12 @@ func (n *stNode) getDiffIndex(key []byte) int {
 	return len(n.key)
 }
 
-// Helper function to that inserts a (key, value) pair into
-// the trie.
+// Helper function that inserts a (key, value) pair into the trie.
+//
+//   - The key is not retained by this method, but always copied if needed.
+//   - The value is retained by this method, as long as the leaf that it represents
+//     remains unhashed. However: it is never modified.
+//   - The path is not retained by this method.
 func (t *StackTrie) insert(st *stNode, key, value []byte, path []byte) {
 	switch st.typ {
 	case branchNode: /* Branch */
@@ -283,7 +308,7 @@ func (t *StackTrie) insert(st *stNode, key, value []byte, path []byte) {
 
 	case emptyNode: /* Empty */
 		st.typ = leafNode
-		st.key = key
+		st.key = append(st.key, key...) // deep-copy the key as it's volatile
 		st.val = value
 
 	case hashedNode:
@@ -318,35 +343,33 @@ func (t *StackTrie) hash(st *stNode, path []byte) {
 		return
 
 	case branchNode:
-		var nodes fullNode
+		var nodes fullnodeEncoder
 		for i, child := range st.children {
 			if child == nil {
-				nodes.Children[i] = nilValueNode
 				continue
 			}
 			t.hash(child, append(path, byte(i)))
+			nodes.Children[i] = child.val
+		}
+		nodes.encode(t.h.encbuf)
+		blob = t.h.encodedBytes()
 
-			if len(child.val) < 32 {
-				nodes.Children[i] = rawNode(child.val)
-			} else {
-				nodes.Children[i] = hashNode(child.val)
+		for i, child := range st.children {
+			if child == nil {
+				continue
 			}
 			st.children[i] = nil
 			stPool.Put(child.reset()) // Release child back to pool.
 		}
-		nodes.encode(t.h.encbuf)
-		blob = t.h.encodedBytes()
 
 	case extNode:
 		// recursively hash and commit child as the first step
 		t.hash(st.children[0], append(path, st.key...))
 
 		// encode the extension node
-		n := shortNode{Key: hexToCompactInPlace(st.key)}
-		if len(st.children[0].val) < 32 {
-			n.Val = rawNode(st.children[0].val)
-		} else {
-			n.Val = hashNode(st.children[0].val)
+		n := extNodeEncoder{
+			Key: hexToCompactInPlace(st.key),
+			Val: st.children[0].val,
 		}
 		n.encode(t.h.encbuf)
 		blob = t.h.encodedBytes()
@@ -356,8 +379,10 @@ func (t *StackTrie) hash(st *stNode, path []byte) {
 
 	case leafNode:
 		st.key = append(st.key, byte(16))
-		n := shortNode{Key: hexToCompactInPlace(st.key), Val: valueNode(st.val)}
-
+		n := leafNodeEncoder{
+			Key: hexToCompactInPlace(st.key),
+			Val: st.val,
+		}
 		n.encode(t.h.encbuf)
 		blob = t.h.encodedBytes()
 
@@ -368,15 +393,19 @@ func (t *StackTrie) hash(st *stNode, path []byte) {
 	st.typ = hashedNode
 	st.key = st.key[:0]
 
+	st.val = nil // Release reference to potentially externally held slice.
+
 	// Skip committing the non-root node if the size is smaller than 32 bytes
 	// as tiny nodes are always embedded in their parent except root node.
 	if len(blob) < 32 && len(path) > 0 {
-		st.val = common.CopyBytes(blob)
+		st.val = bPool.GetWithSize(len(blob))
+		copy(st.val, blob)
 		return
 	}
 	// Write the hash to the 'val'. We allocate a new val here to not mutate
 	// input values.
-	st.val = t.h.hashData(blob)
+	st.val = bPool.GetWithSize(32)
+	t.h.hashDataTo(st.val, blob)
 
 	// Invoke the callback it's provided. Notably, the path and blob slices are
 	// volatile, please deep-copy the slices in callback if the contents need
diff --git a/trie/stacktrie_test.go b/trie/stacktrie_test.go
index f053b5112d3f..7e342e64bf4b 100644
--- a/trie/stacktrie_test.go
+++ b/trie/stacktrie_test.go
@@ -18,6 +18,7 @@ package trie
 
 import (
 	"bytes"
+	"encoding/binary"
 	"math/big"
 	"testing"
 
@@ -398,3 +399,48 @@ func TestStackTrieErrors(t *testing.T) {
 	assert.NotNil(t, s.Update([]byte{0x10}, []byte{0xb}), "out of order insert")
 	assert.NotNil(t, s.Update([]byte{0xaa}, []byte{0xb}), "repeat insert same key")
 }
+
+// BenchmarkInsert100K times 100K ordered inserts plus hashing, verifying the root each run.
+func BenchmarkInsert100K(b *testing.B) {
+	var num = 100_000
+	var key = make([]byte, 8)
+	var val = make([]byte, 20)
+	var hash common.Hash
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		s := NewStackTrie(nil)
+		var k uint64
+		for j := 0; j < num; j++ {
+			binary.BigEndian.PutUint64(key, k)
+			if err := s.Update(key, val); err != nil {
+				b.Fatal(err)
+			}
+			k += 1024
+		}
+		have := s.Hash()
+		if hash == (common.Hash{}) {
+			hash = have // the first iteration provides the reference root
+		} else if have != hash {
+			b.Fatalf("hash wrong, have %x want %x", have, hash)
+		}
+	}
+}
+
+// TestInsert100K checks the root hash after inserting 100K ascending keys.
+func TestInsert100K(t *testing.T) {
+	var (
+		key = make([]byte, 8)
+		val = make([]byte, 20)
+		s   = NewStackTrie(nil)
+	)
+	for j := uint64(0); j < 100_000; j++ {
+		binary.BigEndian.PutUint64(key, j*1024) // keys 0, 1024, 2048, ...
+		if err := s.Update(key, val); err != nil {
+			t.Fatal(err)
+		}
+	}
+	want := common.HexToHash("0xb0071bd257342925d9d8a9f002b9d2b646a35437aa8b089628ab56e428d29a1a")
+	if have := s.Hash(); have != want {
+		t.Fatalf("hash wrong, have %x want %x", have, want)
+	}
+}