From bdc173dc40969dbb579b4bce5a649323fcc40f77 Mon Sep 17 00:00:00 2001
From: Anuraag Agrawal
Date: Thu, 19 Oct 2023 17:04:40 +0900
Subject: [PATCH] Use typed alloc for large objects too (#27)

---
 bitmap.go       |  35 +++++++++++++
 bitmap_test.go  | 102 ++++++++++++++++++++++++++++++++++++
 finalizer.go    |   6 +--
 gc.go           |  48 ++++++++++++-----
 gc_notcustom.go |   2 +-
 intmap.go       | 136 +++++++++++++++++++++++++++++++++++++++++++++++++
 intmap_test.go  |  75 +++++++++++++++++++++++++++
 malloc.go       |  18 +++++++
 8 files changed, 405 insertions(+), 17 deletions(-)
 create mode 100644 bitmap.go
 create mode 100644 bitmap_test.go
 create mode 100644 intmap.go
 create mode 100644 intmap_test.go
 create mode 100644 malloc.go

diff --git a/bitmap.go b/bitmap.go
new file mode 100644
index 0000000..56cbe38
--- /dev/null
+++ b/bitmap.go
@@ -0,0 +1,35 @@
+// Copyright wasilibs authors
+// SPDX-License-Identifier: MIT
+
+package nottinygc
+
+import "unsafe"
+
+// CPP_WORDSZ is a simple integer constant representing the word size
+const cppWordsz = unsafe.Sizeof(uintptr(0)) * 8
+
+type gcBitmap struct {
+	words []uintptr
+}
+
+func newBitmap(size uintptr) gcBitmap {
+	bmSize := gcBitmapSize(size)
+	wordsArr := cmalloc(bmSize * unsafe.Sizeof(uintptr(0)))
+	words := unsafe.Slice((*uintptr)(wordsArr), bmSize)
+	for i := 0; i < len(words); i++ {
+		words[i] = 0
+	}
+	return gcBitmap{words: words}
+}
+
+func (b gcBitmap) set(idx uintptr) {
+	b.words[idx/cppWordsz] |= 1 << (idx % cppWordsz)
+}
+
+func (b gcBitmap) get(idx uintptr) uintptr {
+	return (b.words[idx/cppWordsz] >> (idx % cppWordsz)) & 1
+}
+
+func gcBitmapSize(size uintptr) uintptr {
+	return (size + cppWordsz - 1) / cppWordsz
+}
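gcBitmap is a plain word-array bitset: bit idx lives in word idx/cppWordsz at position idx%cppWordsz. The only unusual part is that the backing words come from cmalloc, since this code runs inside the allocator itself and cannot touch the Go heap. Below is the same arithmetic as an ordinary, standalone Go sketch (the names are illustrative, not from the patch):

// bitset_sketch.go — a minimal sketch of the bitmap arithmetic used by gcBitmap.
package main

import "fmt"

const wordBits = 32 << (^uint(0) >> 63) // 32 or 64 depending on platform

type bitset struct{ words []uint }

func newBitset(size uint) bitset {
	// One word holds wordBits flags; round up to cover all size bits.
	return bitset{words: make([]uint, (size+wordBits-1)/wordBits)}
}

func (b bitset) set(i uint)      { b.words[i/wordBits] |= 1 << (i % wordBits) }
func (b bitset) get(i uint) uint { return (b.words[i/wordBits] >> (i % wordBits)) & 1 }

func main() {
	bs := newBitset(70) // spans two words on a 64-bit platform
	bs.set(0)
	bs.set(69)
	fmt.Println(bs.get(0), bs.get(1), bs.get(69)) // 1 0 1
}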
diff --git a/bitmap_test.go b/bitmap_test.go
new file mode 100644
index 0000000..ebf78ff
--- /dev/null
+++ b/bitmap_test.go
@@ -0,0 +1,102 @@
+// Copyright wasilibs authors
+// SPDX-License-Identifier: MIT
+
+package nottinygc
+
+import (
+	"fmt"
+	"testing"
+)
+
+func TestBitmap32Bits(t *testing.T) {
+	tests := []uintptr{
+		0,
+		0b1,
+		0b101,
+		0b111,
+		0b0001,
+		0b1000001,
+		0xFFFFFFFF,
+		0x11111111,
+		0x01010101,
+		0x0F0F0F0F,
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(fmt.Sprintf("%v", tc), func(t *testing.T) {
+			bm := newBitmap(32)
+			if len(bm.words) != 1 {
+				t.Fatalf("expected 1 word, got %v", len(bm.words))
+			}
+			for i := 0; i < 32; i++ {
+				if tc&(1<<i) != 0 {
+					bm.set(uintptr(i))
+				}
+			}

[... elided in this copy: the rest of bitmap_test.go, the finalizer.go hunks, and the start of the gc.go diff, including the hunk that introduces the descriptorCache global used by allocLarge below ...]

 		layoutSz := (layout >> 1) & (1<<sizeFieldBits - 1)
 		layoutBm := layout >> (1 + sizeFieldBits)
-		buf = allocTyped(size, layoutSz, layoutBm)
-	} else {
+		buf = allocSmall(size, layoutSz, layoutBm)
+	} else if layoutPtr == nil {
+		// Unknown layout, assume all pointers.
 		buf = C.GC_malloc(C.uint(size))
+	} else {
+		buf = allocLarge(size, layoutPtr)
 	}
 	if buf == nil {
 		panic("out of memory")
@@ -108,23 +110,45 @@ func alloc(size uintptr, layoutPtr unsafe.Pointer) unsafe.Pointer {
 	return buf
 }
 
-func allocTyped(allocSz uintptr, layoutSz uintptr, layoutBm uintptr) unsafe.Pointer {
-	descr := gcDescr(layoutSz, layoutBm)
-	if descr == 0 {
+func allocSmall(allocSz uintptr, layoutSz uintptr, layoutBm uintptr) unsafe.Pointer {
+	desc := gcDescr(layoutBm)
+	if desc == 0 {
 		return C.GC_malloc_atomic(C.uint(allocSz))
 	}
+	return allocTyped(allocSz, layoutSz, desc)
+}
+
+func allocLarge(allocSz uintptr, layoutPtr unsafe.Pointer) unsafe.Pointer {
+	layoutSz := *(*uintptr)(layoutPtr)
+	desc, ok := descriptorCache.get(uintptr(layoutPtr))
+	if !ok {
+		bm := newBitmap(layoutSz)
+		bitsPtr := unsafe.Add(layoutPtr, unsafe.Sizeof(uintptr(0)))
+		for i := uintptr(0); i < layoutSz; i++ {
+			if (*(*uint8)(unsafe.Add(bitsPtr, i/8))>>(i%8))&1 != 0 {
+				bm.set(i)
+			}
+		}
+		desc = uintptr(C.GC_make_descriptor(unsafe.Pointer(&bm.words[0]), C.uint(layoutSz)))
+		descriptorCache.put(uintptr(layoutPtr), desc)
+	}
+
+	return allocTyped(allocSz, layoutSz, desc)
+}
+
+func allocTyped(allocSz uintptr, layoutSz uintptr, desc uintptr) unsafe.Pointer {
 	itemSz := layoutSz * unsafe.Sizeof(uintptr(0))
 	if itemSz == allocSz {
-		return C.GC_malloc_explicitly_typed(C.uint(allocSz), C.uint(descr))
+		return C.GC_malloc_explicitly_typed(C.uint(allocSz), C.uint(desc))
 	}
 	numItems := allocSz / itemSz
-	return C.GC_calloc_explicitly_typed(C.uint(numItems), C.uint(itemSz), C.uint(descr))
+	return C.GC_calloc_explicitly_typed(C.uint(numItems), C.uint(itemSz), C.uint(desc))
 }
 
 // Reimplementation of the simple bitmap case from bdwgc
 // https://github.com/ivmai/bdwgc/blob/806537be2dec4f49056cb2fe091ac7f7d78728a8/typd_mlc.c#L204
-func gcDescr(layoutSz uintptr, layoutBm uintptr) uintptr {
+func gcDescr(layoutBm uintptr) uintptr {
 	if layoutBm == 0 {
 		return 0 // no pointers
 	}
diff --git a/gc_notcustom.go b/gc_notcustom.go
index f15c6bc..87c2c71 100644
--- a/gc_notcustom.go
+++ b/gc_notcustom.go
@@ -1,7 +1,7 @@
 // Copyright wasilibs authors
 // SPDX-License-Identifier: MIT
 
-//go:build !gc.custom
+//go:build tinygo && !gc.custom
 
 package nottinygc
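For context on the layout word unpacked at the top of the gc.go hunk: TinyGo passes small object layouts inline in the layout argument — a set low bit tags the value as an inline layout, the next few bits carry the object size in words, and the remaining bits are the pointer bitmap. A minimal standalone sketch of that decoding, assuming the 64-bit field width of 6 for sizeFieldBits (the width selection itself is in the elided part of the diff); all names here are illustrative:

// layout_sketch.go — decoding a packed TinyGo layout word.
package main

import "fmt"

const sizeFieldBits = 6 // assumed 64-bit uintptr; 32-bit targets use 5

func decode(layout uintptr) (sizeWords, ptrBitmap uintptr, inline bool) {
	if layout&1 == 0 {
		// Not tagged: the value is really a pointer to an out-of-line bitmap,
		// which is the allocLarge path above.
		return 0, 0, false
	}
	sizeWords = (layout >> 1) & (1<<sizeFieldBits - 1)
	ptrBitmap = layout >> (1 + sizeFieldBits)
	return sizeWords, ptrBitmap, true
}

func main() {
	// A hypothetical 3-word object whose first and third words hold pointers:
	// bitmap 0b101, size 3, tag bit 1.
	layout := uintptr(0b101)<<(1+sizeFieldBits) | uintptr(3)<<1 | 1
	size, bm, ok := decode(layout)
	fmt.Printf("%d words, bitmap %b, inline %v\n", size, bm, ok) // 3 words, bitmap 101, inline true
}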
diff --git a/intmap.go b/intmap.go
new file mode 100644
index 0000000..f5b5571
--- /dev/null
+++ b/intmap.go
@@ -0,0 +1,136 @@
+// Copyright wasilibs authors
+// SPDX-License-Identifier: MIT
+
+package nottinygc
+
+import "C"
+import "unsafe"
+
+// Aim for initial overhead on the order of a few kilobytes.
+
+const (
+	initialBuckets = 512
+	numEmbedded    = 8
+)
+
+type item struct {
+	key uintptr
+	val uintptr
+}
+
+type extraNode struct {
+	next *extraNode
+	item item
+}
+
+type bucket struct {
+	embedded [numEmbedded]item
+	extra    *extraNode
+	count    byte
+}
+
+// intMap is a map from int to int. As it is used to cache descriptors within
+// allocation, it cannot itself allocate using the Go heap and uses malloc
+// instead. It also takes advantage of knowing we never replace values, so it
+// does not support replacement.
+type intMap struct {
+	buckets []bucket
+	count   int
+}
+
+func newIntMap() intMap {
+	return intMap{
+		buckets: newBuckets(initialBuckets),
+		count:   0,
+	}
+}
+
+func (m *intMap) put(key uintptr, val uintptr) {
+	if float64(m.count+1) > float64(len(m.buckets))*0.75 {
+		m.resize()
+	}
+	doPut(m.buckets, key, val)
+	m.count++
+}
+
+func doPut(buckets []bucket, key uintptr, val uintptr) {
+	pos := hash(key) % uintptr(len(buckets))
+	b := &buckets[pos]
+	if b.count < numEmbedded {
+		b.embedded[b.count] = item{key: key, val: val}
+	} else {
+		e := newExtraNode()
+		e.item = item{key: key, val: val}
+		e.next = b.extra
+		b.extra = e
+	}
+	b.count++
+}
+
+func (m *intMap) resize() {
+	newSz := len(m.buckets) * 2
+	newBkts := newBuckets(newSz)
+	for i := 0; i < len(m.buckets); i++ {
+		b := &m.buckets[i]
+		for j := 0; j < int(b.count); j++ {
+			if j < numEmbedded {
+				doPut(newBkts, b.embedded[j].key, b.embedded[j].val)
+			} else {
+				for n := b.extra; n != nil; {
+					doPut(newBkts, n.item.key, n.item.val)
+					next := n.next
+					cfree(unsafe.Pointer(n))
+					n = next
+				}
+				break
+			}
+		}
+	}
+	cfree(unsafe.Pointer(&m.buckets[0]))
+	m.buckets = newBkts
+}
+
+func (m *intMap) get(key uintptr) (uintptr, bool) {
+	pos := hash(key) % uintptr(len(m.buckets))
+	b := &m.buckets[pos]
+	for i := 0; i < int(b.count); i++ {
+		if i < numEmbedded {
+			if b.embedded[i].key == key {
+				return b.embedded[i].val, true
+			}
+		} else {
+			for n := b.extra; n != nil; n = n.next {
+				if n.item.key == key {
+					return n.item.val, true
+				}
+			}
+			break
+		}
+	}
+	return 0, false
+}
+
+func hash(key uintptr) uintptr {
+	// Use Java algorithm for cheap and easy handling of aligned values.
+	// There are better ones with more operations, we can try later if needed.
+	return key ^ (key >> 16)
+}
+
+func newBuckets(size int) []bucket {
+	sz := unsafe.Sizeof(bucket{}) * uintptr(size)
+	bucketsArr := cmalloc(sz)
+	for i := uintptr(0); i < sz; i++ {
+		*(*byte)(unsafe.Pointer(uintptr(bucketsArr) + i)) = 0
+	}
+	buckets := unsafe.Slice((*bucket)(bucketsArr), size)
+	return buckets
+}
+
+func newExtraNode() *extraNode {
+	sz := unsafe.Sizeof(extraNode{})
+	arr := cmalloc(sz)
+	for i := uintptr(0); i < sz; i++ {
+		*(*byte)(unsafe.Pointer(uintptr(arr) + i)) = 0
+	}
+	return (*extraNode)(arr)
+}
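On the hash comment in intmap.go: the keys here are layout pointers, so their low bits are mostly zero and shared across keys; without mixing, bucket selection by modulo would pile everything into a few buckets. key ^ (key >> 16) is the cheap spreader popularized by Java's HashMap, folding the high bits into the bits that pick the bucket. A small standalone demonstration:

// spread_sketch.go — why the map XOR-folds high bits before bucketing.
package main

import "fmt"

func spread(key uintptr) uintptr { return key ^ (key >> 16) }

func main() {
	const buckets = 512
	// Page-aligned addresses: all would land in bucket 0 without spreading.
	for _, key := range []uintptr{0x10000, 0x20000, 0x30000} {
		fmt.Printf("key %#x -> bucket %d (plain) vs %d (spread)\n",
			key, key%buckets, spread(key)%buckets)
	}
}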
diff --git a/intmap_test.go b/intmap_test.go
new file mode 100644
index 0000000..53e9327
--- /dev/null
+++ b/intmap_test.go
@@ -0,0 +1,75 @@
+// Copyright wasilibs authors
+// SPDX-License-Identifier: MIT
+
+package nottinygc
+
+import "testing"
+
+func TestIntMapBasic(t *testing.T) {
+	m := newIntMap()
+	_, ok := m.get(5)
+	if ok {
+		t.Fatal("expected not ok in empty map")
+	}
+
+	m.put(5, 10)
+	v, ok := m.get(5)
+	if !ok {
+		t.Fatal("expected ok in map")
+	}
+	if v != 10 {
+		t.Fatal("expected 10 in map")
+	}
+}
+
+func TestIntMapNoResize(t *testing.T) {
+	top := int(0.75 * 512)
+	m := newIntMap()
+	for i := 0; i < top; i++ {
+		m.put(uintptr(i), uintptr(i))
+	}
+
+	if len(m.buckets) != 512 {
+		t.Fatal("expected 512 buckets")
+	}
+
+	for i := 0; i < top; i++ {
+		v, ok := m.get(uintptr(i))
+		if !ok {
+			t.Fatalf("expected %d to be in map", i)
+		}
+		if v != uintptr(i) {
+			t.Fatalf("expected %d to have value %d in map, got %d", i, i, v)
+		}
+	}
+	_, ok := m.get(uintptr(top))
+	if ok {
+		t.Fatal("expected not ok in map")
+	}
+}
+
+func TestIntMapResize(t *testing.T) {
+	top := 512
+	m := newIntMap()
+	for i := 0; i < top; i++ {
+		m.put(uintptr(i), uintptr(i))
+	}
+
+	if len(m.buckets) != 1024 {
+		t.Fatal("expected 1024 buckets")
+	}
+
+	for i := 0; i < top; i++ {
+		v, ok := m.get(uintptr(i))
+		if !ok {
+			t.Fatalf("expected %d to be in map", i)
+		}
+		if v != uintptr(i) {
+			t.Fatalf("expected %d to have value %d in map, got %d", i, i, v)
+		}
+	}
+	_, ok := m.get(uintptr(top))
+	if ok {
+		t.Fatal("expected not ok in map")
+	}
+}
diff --git a/malloc.go b/malloc.go
new file mode 100644
index 0000000..c15972b
--- /dev/null
+++ b/malloc.go
@@ -0,0 +1,18 @@
+// Copyright wasilibs authors
+// SPDX-License-Identifier: MIT
+
+package nottinygc
+
+/*
+#include <stdlib.h>
+*/
+import "C"
+import "unsafe"
+
+func cmalloc(size uintptr) unsafe.Pointer {
+	return C.malloc(C.size_t(size))
+}
+
+func cfree(ptr unsafe.Pointer) {
+	C.free(ptr)
+}
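Note that newBuckets and newExtraNode above pair cmalloc with a manual byte-zeroing loop: malloc returns uninitialized memory, and both structures rely on zero values (nil extra pointers, zero counts). The same pattern as a standalone cgo sketch, using memset where the patch hand-rolls the loop (zmalloc is an illustrative name, not from the patch):

// zmalloc_sketch.go — malloc-backed, zero-initialized allocation off the Go heap.
package main

/*
#include <stdlib.h>
#include <string.h>
*/
import "C"

import (
	"fmt"
	"unsafe"
)

// zmalloc allocates size bytes with C malloc and zeroes them before use.
func zmalloc(size uintptr) unsafe.Pointer {
	p := C.malloc(C.size_t(size))
	C.memset(p, 0, C.size_t(size))
	return p
}

func main() {
	p := zmalloc(16)
	defer C.free(p)
	b := unsafe.Slice((*byte)(p), 16)
	fmt.Println(b[0], b[15]) // 0 0 — guaranteed by the memset, not by malloc
}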