From 5f36833b71e8bcce6ab9c89e8e98246638e465a1 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi
Date: Mon, 3 Jul 2023 10:06:22 -0300
Subject: [PATCH] Implement new GC heuristics.

---
 NEWS.md               |   1 +
 doc/src/devdocs/gc.md |  12 ++-
 src/gc-debug.c        |  22 ++++-
 src/gc-pages.c        |   4 +
 src/gc.c              | 192 +++++++++++++++++++++++-------------
 src/gc.h              |  16 +++-
 6 files changed, 156 insertions(+), 91 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 631806296eebd..2af55a888fc91 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,7 @@ Language changes
 
 Compiler/Runtime improvements
 -----------------------------
+* Updated GC heuristics to count allocated pages instead of individual objects ([#50144]).
 
 Command-line option changes
 ---------------------------

diff --git a/doc/src/devdocs/gc.md b/doc/src/devdocs/gc.md
index c072912e77c3f..942535f426b34 100644
--- a/doc/src/devdocs/gc.md
+++ b/doc/src/devdocs/gc.md
@@ -67,6 +67,12 @@ This scheme eliminates the need of explicitly keeping a flag to indicate a full
 
 ## Heuristics
 
 GC heuristics tune the GC by changing the size of the allocation interval between garbage collections.
-If a GC was unproductive, then we increase the size of the allocation interval to allow objects more time to die.
-If a GC returns a lot of space we can shrink the interval. The goal is to find a steady state where we are
-allocating just about the same amount as we are collecting.
+
+The GC heuristics measure the heap size after each collection and set the next collection
+target according to the algorithm described in https://dl.acm.org/doi/10.1145/3563323.
+In summary, the paper argues that the heap target should have a square-root relationship with the live heap,
+scaled by how fast the GC is freeing objects and how fast the mutators are allocating.
+The heuristics measure the heap size by counting the pages in use and the bytes obtained through malloc.
+Previously we measured the heap size by counting live objects, which does not account for fragmentation
+and could lead to bad decisions. It also meant that thread-local information (allocations) was used to make
+a process-wide decision (when to GC); measuring pages makes that decision global.
+
+The GC will do full collections when the heap size reaches 80% of the maximum allowed size.

diff --git a/src/gc-debug.c b/src/gc-debug.c
index bab2c5b0fa607..56441ae09b8e6 100644
--- a/src/gc-debug.c
+++ b/src/gc-debug.c
@@ -1,7 +1,10 @@
 // This file is a part of Julia. License is MIT: https://julialang.org/license
 
 #include "gc.h"
+#include "julia.h"
 #include
+#include
+#include
 #include
 
 // re-include assert.h without NDEBUG,
@@ -1216,15 +1219,30 @@ JL_DLLEXPORT void jl_enable_gc_logging(int enable) {
     gc_logging_enabled = enable;
 }
 
-void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT {
+void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT {
     if (!gc_logging_enabled) {
         return;
     }
     jl_safe_printf("GC: pause %.2fms. collected %fMB. %s %s\n",
-        pause/1e6, freed/1e6,
+        pause/1e6, freed/(double)(1<<20),
        full ? "full" : "incr",
        recollect ?
"recollect" : "" ); + + jl_safe_printf("Heap stats: bytes_mapped %.2f MB, bytes_allocd %.2f MB\nbytes_freed %.2f MB, bytes_mallocd %.1f, malloc_bytes_freed %.2f MB\npages_perm_allocd %zu, heap_size %.2f MB, heap_target %.2f MB, live_bytes %.2f MB\n", + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mapped)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.bytes_allocd)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.bytes_freed)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.pages_perm_allocd), + jl_atomic_load_relaxed(&gc_heap_stats.heap_size)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.heap_target)/(double)(1<<20), + live_bytes/(double)(1<<20) + ); + double bytes_mapped = (jl_atomic_load_relaxed(&gc_heap_stats.bytes_resident) + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd) - jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed))/(double)(1<<20); + jl_safe_printf("Fragmentation %f, mapped_bytes %.2f MB\n", (double)live_bytes/(double)jl_atomic_load_relaxed(&gc_heap_stats.heap_size), bytes_mapped); + // Should fragmentation use bytes_resident instead of heap_size? } #ifdef __cplusplus diff --git a/src/gc-pages.c b/src/gc-pages.c index 682e76611f5d9..8d596f4a815ca 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -52,6 +52,8 @@ char *jl_gc_try_alloc_pages_(int pg_cnt) JL_NOTSAFEPOINT // round data pointer up to the nearest gc_page_data-aligned // boundary if mmap didn't already do so. mem = (char*)gc_page_data(mem + GC_PAGE_SZ - 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mapped, pages_sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, pages_sz); return mem; } @@ -115,6 +117,7 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT // try to get page from `pool_freed` meta = pop_lf_page_metadata_back(&global_page_pool_freed); if (meta != NULL) { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, GC_PAGE_SZ); gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); goto exit; } @@ -188,6 +191,7 @@ void jl_gc_free_page(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT madvise(p, decommit_size, MADV_DONTNEED); #endif msan_unpoison(p, decommit_size); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, -decommit_size); } #ifdef __cplusplus diff --git a/src/gc.c b/src/gc.c index 9fd93b7340d56..325c4b023c824 100644 --- a/src/gc.c +++ b/src/gc.c @@ -178,6 +178,8 @@ jl_gc_num_t gc_num = {0}; static size_t last_long_collect_interval; int gc_n_threads; jl_ptls_t* gc_all_tls_states; +gc_heapstatus_t gc_heap_stats = {0}; +int next_sweep_full = 0; const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) { @@ -665,19 +667,26 @@ static int64_t last_gc_total_bytes = 0; #ifdef _P64 typedef uint64_t memsize_t; static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); -static const size_t max_collect_interval = 1250000000UL; static size_t total_mem; // We expose this to the user/ci as jl_gc_set_max_memory static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; #else typedef uint32_t memsize_t; static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); -static const size_t max_collect_interval = 500000000UL; // Work really hard to stay within 2GB // Alternative is to risk running out of address space // on 32 
bit architectures. -static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; +#define MAX32HEAP 1536 * 1024 * 1024 +static memsize_t max_total_memory = (memsize_t) MAX32HEAP; #endif +// heuristic stuff for https://dl.acm.org/doi/10.1145/3563323 +static uint64_t old_pause_time = 0; +static uint64_t old_mut_time = 0; +static uint64_t old_heap_size = 0; +static uint64_t old_alloc_diff = 0; +static uint64_t old_freed_diff = 0; +static uint64_t gc_end_time = 0; + // global variables for GC stats @@ -912,7 +921,7 @@ void gc_setmark_buf(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) JL STATIC_INLINE void maybe_collect(jl_ptls_t ptls) { - if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { + if (jl_atomic_load_relaxed(&gc_heap_stats.heap_size) >= jl_atomic_load_relaxed(&gc_heap_stats.heap_target) || jl_gc_debug_check_other()) { jl_gc_collect(JL_GC_AUTO); } else { @@ -1001,6 +1010,8 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); #ifdef MEMDEBUG memset(v, 0xee, allocsz); #endif @@ -1046,6 +1057,8 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT if (nxt) nxt->prev = pv; gc_num.freed += v->sz&~3; + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, v->sz&~3); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(v->sz&~3)); #ifdef MEMDEBUG memset(v, 0xbb, v->sz&~3); #endif @@ -1105,6 +1118,8 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT jl_ptls_t ptls = jl_current_task->ptls; jl_atomic_store_relaxed(&ptls->gc_num.allocd, jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); } static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT @@ -1179,6 +1194,8 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT jl_free_aligned(d); else free(d); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, jl_array_nbytes(a)); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -jl_array_nbytes(a)); gc_num.freed += jl_array_nbytes(a); gc_num.freecall++; } @@ -1250,6 +1267,8 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT set_page_metadata(pg); push_page_metadata_back(&ptls->page_metadata_allocd, pg); jl_taggedvalue_t *fl = gc_reset_page(ptls, p, pg); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_allocd, GC_PAGE_SZ); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, GC_PAGE_SZ); p->newpages = fl; return fl; } @@ -1443,8 +1462,12 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo } else if (freed_lazily) { push_page_metadata_back(lazily_freed, pg); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_freed, GC_PAGE_SZ); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ); } else { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_freed, GC_PAGE_SZ); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ); #ifdef _P64 // only enable concurrent sweeping on 64bit if (jl_n_sweepthreads == 0) { jl_gc_free_page(pg); @@ -3056,6 +3079,11 @@ JL_DLLEXPORT int64_t jl_gc_live_bytes(void) return live_bytes; } +double 
jl_gc_smooth(uint64_t old_val, uint64_t new_val, double factor) +{ + return factor * old_val + (1.0-factor) * new_val; +} + size_t jl_maxrss(void); // Only one thread should be running in this function @@ -3070,6 +3098,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) jl_gc_markqueue_t *mq = &ptls->mark_queue; uint64_t gc_start_time = jl_hrtime(); + uint64_t mutator_time = gc_start_time - gc_end_time; + uint64_t before_free_heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size); int64_t last_perm_scanned_bytes = perm_scanned_bytes; uint64_t start_mark_time = jl_hrtime(); JL_PROBE_GC_MARK_BEGIN(); @@ -3160,19 +3190,14 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) uint64_t mark_time = end_mark_time - start_mark_time; gc_num.mark_time = mark_time; gc_num.total_mark_time += mark_time; - int64_t allocd = gc_num.allocd; gc_settime_postmark_end(); // marking is over // Flush everything in mark cache gc_sync_all_caches_nolock(ptls); - int64_t live_sz_ub = live_bytes + allocd; - int64_t live_sz_est = scanned_bytes + perm_scanned_bytes; - int64_t estimate_freed = live_sz_ub - live_sz_est; gc_verify(ptls); - gc_stats_all_pool(); gc_stats_big_obj(); objprofile_printall(); @@ -3181,42 +3206,17 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (!prev_sweep_full) promoted_bytes += perm_scanned_bytes - last_perm_scanned_bytes; // 5. next collection decision - int not_freed_enough = (collection == JL_GC_AUTO) && estimate_freed < (7*(allocd/10)); - int nptr = 0; + int remset_nptr = 0; + int sweep_full = next_sweep_full; + int recollect = 0; assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) - nptr += ptls2->heap.remset_nptr; + remset_nptr += ptls2->heap.remset_nptr; } + (void)remset_nptr; //Use this information for something? - // many pointers in the intergen frontier => "quick" mark is not quick - int large_frontier = nptr*sizeof(void*) >= default_collect_interval; - int sweep_full = 0; - int recollect = 0; - - // update heuristics only if this GC was automatically triggered - if (collection == JL_GC_AUTO) { - if (large_frontier) { - sweep_full = 1; - gc_num.interval = last_long_collect_interval; - } - if (not_freed_enough || large_frontier) { - gc_num.interval = gc_num.interval * 2; - } - - size_t maxmem = 0; -#ifdef _P64 - // on a big memory machine, increase max_collect_interval to totalmem / nthreads / 2 - maxmem = total_mem / (gc_n_threads - jl_n_gcthreads) / 2; -#endif - if (maxmem < max_collect_interval) - maxmem = max_collect_interval; - if (gc_num.interval > maxmem) { - sweep_full = 1; - gc_num.interval = maxmem; - } - } // If the live data outgrows the suggested max_total_memory // we keep going with minimum intervals and full gcs until @@ -3236,7 +3236,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // on the first collection after sweep_full, and the current scan perm_scanned_bytes = 0; promoted_bytes = 0; - last_long_collect_interval = gc_num.interval; } scanned_bytes = 0; // 6. 
start sweeping @@ -3261,9 +3260,10 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (sweep_full) gc_sweep_perm_alloc(); } + JL_PROBE_GC_SWEEP_END(); - uint64_t gc_end_time = jl_hrtime(); + gc_end_time = jl_hrtime(); uint64_t pause = gc_end_time - gc_start_time; uint64_t sweep_time = gc_end_time - start_sweep_time; gc_num.total_sweep_time += sweep_time; @@ -3272,6 +3272,44 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) gc_num.last_full_sweep = gc_end_time; } + size_t heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size); + uint64_t alloc_diff = before_free_heap_size - old_heap_size; + uint64_t freed_diff = before_free_heap_size - heap_size; + + double alloc_smooth_factor = 0.95; + double collec_smooth_factor = 0.5; + double tuning_factor = 0.03; + double alloc_mem = jl_gc_smooth(old_alloc_diff, alloc_diff, alloc_smooth_factor); + double alloc_time = jl_gc_smooth(old_mut_time, mutator_time, alloc_smooth_factor); + double gc_mem = jl_gc_smooth(old_freed_diff, freed_diff, collec_smooth_factor); + double gc_time = jl_gc_smooth(old_pause_time, pause, collec_smooth_factor); + old_alloc_diff = alloc_diff; + old_mut_time = mutator_time; + old_freed_diff = freed_diff; + old_pause_time = pause; + old_heap_size = heap_size; + double min_interval = default_collect_interval; + double target_allocs; + if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0) { + double alloc_rate = alloc_mem/alloc_time; + double gc_rate = gc_mem/gc_time; + target_allocs = sqrt(((double)heap_size/min_interval * alloc_rate)/(gc_rate * tuning_factor)); // work on multiples of min interval + } + else + target_allocs = 2*sqrt((double)heap_size/min_interval); + + uint64_t target_heap = (uint64_t)target_allocs*min_interval + heap_size; + if (target_heap > max_total_memory) + target_heap = max_total_memory; + else if (target_heap < default_collect_interval) + target_heap = default_collect_interval; + jl_atomic_store_relaxed(&gc_heap_stats.heap_target, target_heap); + + double old_ratio = (double)promoted_bytes/(double)heap_size; + if (heap_size > max_total_memory * 0.8 || old_ratio > 0.15) + next_sweep_full = 1; + else + next_sweep_full = 0; // sweeping is over // 7. if it is a quick sweep, put back the remembered objects in queued state // so that we don't trigger the barrier again on them. 
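
As a reading aid for the hunk above, here is a minimal standalone sketch of the target-heap computation it adds to _jl_gc_collect. The function and parameter names (pick_heap_target, smooth, min_interval, max_heap) are illustrative only; the real code works on the globals old_*, gc_heap_stats, default_collect_interval, and max_total_memory, and its smoothing helper is jl_gc_smooth with factors 0.95 (mutator side) and 0.5 (collector side).

/* Reading aid, not part of the patch: how the new heuristic picks the next
 * heap target, following https://dl.acm.org/doi/10.1145/3563323. */
#include <math.h>
#include <stdint.h>

/* Corresponds to jl_gc_smooth in the patch. */
static double smooth(double old_val, double new_val, double factor)
{
    return factor * old_val + (1.0 - factor) * new_val;
}

static uint64_t pick_heap_target(uint64_t heap_size,                  /* bytes held after the sweep     */
                                 double alloc_mem, double alloc_time, /* smoothed mutator bytes and ns  */
                                 double gc_mem, double gc_time,       /* smoothed freed bytes and ns    */
                                 double min_interval, uint64_t max_heap)
{
    const double tuning_factor = 0.03;
    double target_allocs;
    if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0) {
        double alloc_rate = alloc_mem / alloc_time;
        double gc_rate = gc_mem / gc_time;
        /* square-root rule: headroom grows with sqrt(live heap * alloc_rate / gc_rate),
         * expressed in multiples of min_interval */
        target_allocs = sqrt(((double)heap_size / min_interval * alloc_rate) / (gc_rate * tuning_factor));
    }
    else {
        target_allocs = 2 * sqrt((double)heap_size / min_interval);
    }
    uint64_t target = (uint64_t)(target_allocs * min_interval) + heap_size;
    /* clamp between one default interval and the maximum allowed heap */
    if (target > max_heap)
        target = max_heap;
    else if (target < (uint64_t)min_interval)
        target = (uint64_t)min_interval;
    return target;
}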
@@ -3303,55 +3341,19 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } #endif - _report_gc_finished(pause, gc_num.freed, sweep_full, recollect); - - gc_final_pause_end(gc_start_time, gc_end_time); - gc_time_sweep_pause(gc_end_time, allocd, live_bytes, - estimate_freed, sweep_full); - gc_num.full_sweep += sweep_full; + _report_gc_finished(pause, gc_num.freed, sweep_full, recollect, live_bytes); uint64_t max_memory = last_live_bytes + gc_num.allocd; if (max_memory > gc_num.max_memory) { gc_num.max_memory = max_memory; } - + gc_final_pause_end(gc_start_time, gc_end_time); + gc_time_sweep_pause(gc_end_time, allocd, live_bytes, + estimate_freed, sweep_full); + gc_num.full_sweep += sweep_full; last_live_bytes = live_bytes; - // Can't call inc_live_bytes here because we already added allocd - // to the graph earlier live_bytes += -gc_num.freed + gc_num.allocd; jl_timing_counter_dec(JL_TIMING_COUNTER_HeapSize, gc_num.freed); - if (collection == JL_GC_AUTO) { - //If we aren't freeing enough or are seeing lots and lots of pointers let it increase faster - if (!not_freed_enough || large_frontier) { - int64_t tot = 2 * (live_bytes + gc_num.allocd) / 3; - if (gc_num.interval > tot) { - gc_num.interval = tot; - last_long_collect_interval = tot; - } - // If the current interval is larger than half the live data decrease the interval - } - else { - int64_t half = (live_bytes / 2); - if (gc_num.interval > half) - gc_num.interval = half; - } - - // But never go below default - if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval; - } - - if (gc_num.interval + live_bytes > max_total_memory) { - if (live_bytes < max_total_memory) { - gc_num.interval = max_total_memory - live_bytes; - last_long_collect_interval = max_total_memory - live_bytes; - } - else { - // We can't stay under our goal so let's go back to - // the minimum interval and hope things get better - gc_num.interval = default_collect_interval; - } - } - gc_time_summary(sweep_full, t_start, gc_end_time, gc_num.freed, live_bytes, gc_num.interval, pause, gc_num.time_to_safepoint, @@ -3542,7 +3544,7 @@ void jl_gc_init(void) arraylist_new(&finalizer_list_marked, 0); arraylist_new(&to_finalize, 0); - + jl_atomic_store_relaxed(&gc_heap_stats.heap_target, default_collect_interval); gc_num.interval = default_collect_interval; last_long_collect_interval = default_collect_interval; gc_num.allocd = 0; @@ -3558,7 +3560,7 @@ void jl_gc_init(void) if (total_mem < 128e9) percent = total_mem * 2.34375e-12 + 0.6; // 60% at 0 gigs and 90% at 128 to not else // overcommit too much on memory contrained devices - percent = 0.9; + percent = 0.8; max_total_memory = total_mem * percent; #endif if (jl_options.heap_size_hint) @@ -3571,7 +3573,11 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) { if (max_mem > 0 && max_mem < (uint64_t)1 << (sizeof(memsize_t) * 8 - 1)) { + #ifdef _P64 max_total_memory = max_mem; + #else + max_total_memory = max_mem < MAX32HEAP ? 
max_mem : MAX32HEAP; + #endif } } @@ -3599,6 +3605,8 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); } return malloc(sz); } @@ -3614,6 +3622,8 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, nm*sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, nm*sz); } return calloc(nm, sz); } @@ -3629,6 +3639,8 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); jl_atomic_store_relaxed(&ptls->gc_num.freecall, jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -sz); } } @@ -3647,6 +3659,9 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old)); jl_atomic_store_relaxed(&ptls->gc_num.realloc, jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, old); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old); } return realloc(p, sz); } @@ -3721,6 +3736,8 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); int last_errno = errno; #ifdef _OS_WINDOWS_ DWORD last_error = GetLastError(); @@ -3759,7 +3776,9 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz)); jl_atomic_store_relaxed(&ptls->gc_num.realloc, jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); - + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, oldsz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz); int last_errno = errno; #ifdef _OS_WINDOWS_ DWORD last_error = GetLastError(); @@ -3848,6 +3867,8 @@ static void *gc_perm_alloc_large(size_t sz, int zero, unsigned align, unsigned o #ifdef _OS_WINDOWS_ SetLastError(last_error); #endif + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_allocd,sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size,sz); errno = last_errno; jl_may_leak(base); assert(align > 0); @@ -3891,6 +3912,7 @@ void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offs errno = last_errno; if (__unlikely(pool == MAP_FAILED)) return NULL; + jl_atomic_fetch_add_relaxed(&gc_heap_stats.pages_perm_allocd, 1); #endif gc_perm_pool = (uintptr_t)pool; gc_perm_end = gc_perm_pool + GC_PERM_POOL_SIZE; diff --git a/src/gc.h b/src/gc.h index b1eee5a1d5bda..6e95cc93711b8 100644 --- a/src/gc.h +++ b/src/gc.h @@ -10,6 +10,7 @@ #define JL_GC_H #include +#include #include #include #include @@ -257,6 +258,18 @@ typedef 
struct { pagetable1_t *meta1[REGION2_PG_COUNT]; } pagetable_t; +typedef struct { + _Atomic(size_t) bytes_mapped; + _Atomic(size_t) bytes_resident; + _Atomic(size_t) bytes_freed; + _Atomic(size_t) bytes_allocd; + _Atomic(size_t) bytes_mallocd; + _Atomic(size_t) malloc_bytes_freed; + _Atomic(size_t) pages_perm_allocd; + _Atomic(size_t) heap_size; + _Atomic(size_t) heap_target; +} gc_heapstatus_t; + #define GC_PAGE_UNMAPPED 0 #define GC_PAGE_ALLOCATED 1 #define GC_PAGE_LAZILY_FREED 2 @@ -374,6 +387,7 @@ extern int64_t lazy_freed_pages; extern int gc_first_tid; extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; +extern gc_heapstatus_t gc_heap_stats; STATIC_INLINE bigval_t *bigval_header(jl_taggedvalue_t *o) JL_NOTSAFEPOINT { @@ -637,7 +651,7 @@ void gc_count_pool(void); size_t jl_array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_enable_gc_logging(int enable); -void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT; +void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT; #ifdef __cplusplus }
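
To summarize the accounting discipline the patch follows, here is a minimal standalone sketch with assumed names (heap_stats_sketch_t, note_alloc, note_free, should_collect), using C11 atomics in place of Julia's jl_atomic_* wrappers: every allocation path adds to heap_size, every free path subtracts, and collection is triggered by comparing heap_size against the heap_target chosen after the previous GC.

/* Sketch only; not code from the patch. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct {
    _Atomic(size_t) bytes_mallocd;       /* cumulative bytes obtained through malloc   */
    _Atomic(size_t) malloc_bytes_freed;  /* cumulative bytes returned to malloc        */
    _Atomic(size_t) heap_size;           /* pages plus malloc'd bytes currently held   */
    _Atomic(size_t) heap_target;         /* threshold chosen by the heuristic          */
} heap_stats_sketch_t;

static void note_alloc(heap_stats_sketch_t *s, size_t sz)
{
    atomic_fetch_add_explicit(&s->bytes_mallocd, sz, memory_order_relaxed);
    atomic_fetch_add_explicit(&s->heap_size, sz, memory_order_relaxed);
}

static void note_free(heap_stats_sketch_t *s, size_t sz)
{
    atomic_fetch_add_explicit(&s->malloc_bytes_freed, sz, memory_order_relaxed);
    /* the patch expresses this as jl_atomic_fetch_add_relaxed(&..., -sz) */
    atomic_fetch_sub_explicit(&s->heap_size, sz, memory_order_relaxed);
}

/* Mirrors the new maybe_collect() check: collect once the heap reaches its target. */
static bool should_collect(heap_stats_sketch_t *s)
{
    return atomic_load_explicit(&s->heap_size, memory_order_relaxed) >=
           atomic_load_explicit(&s->heap_target, memory_order_relaxed);
}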