
Page based heap size heuristics #50144

Merged · 13 commits · Jul 23, 2023
1 change: 1 addition & 0 deletions NEWS.md
@@ -9,6 +9,7 @@ Language changes

Compiler/Runtime improvements
-----------------------------
* Updated GC heuristics to count allocated pages instead of individual objects ([#50144]).

Command-line option changes
---------------------------
255 changes: 255 additions & 0 deletions batch.diff
@@ -0,0 +1,255 @@
diff --git a/src/gc.c b/src/gc.c
Member comment: Remove?

index c85d1e5455..c82b2b645d 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -6,6 +6,8 @@
#include "julia_gcext.h"
#include "julia_assert.h"
#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
#include <sys/types.h>
#ifdef __GLIBC__
#include <malloc.h> // for malloc_trim
@@ -1004,8 +1006,14 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz);
jl_atomic_store_relaxed(&ptls->gc_num.bigalloc,
jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + allocsz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + allocsz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + allocsz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + allocsz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
#ifdef MEMDEBUG
memset(v, 0xee, allocsz);
#endif
@@ -1051,8 +1059,10 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT
if (nxt)
nxt->prev = pv;
gc_num.freed += v->sz&~3;
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, v->sz&~3);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(v->sz&~3));
+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed,
+ jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + (v->sz&~3));
+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size,
+ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - (v->sz&~3));
#ifdef MEMDEBUG
memset(v, 0xbb, v->sz&~3);
#endif
@@ -1112,8 +1122,14 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT
jl_ptls_t ptls = jl_current_task->ptls;
jl_atomic_store_relaxed(&ptls->gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}

static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
@@ -1126,12 +1142,15 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
jl_ptls_t ptls = gc_all_tls_states[i];
if (ptls) {
dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval);
- dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed);
dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc);
dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc);
dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc);
dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
- dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ jl_atomic_store_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd));
+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed));
+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_thresh - free_thresh + jl_atomic_load_relaxed(&gc_heap_stats.heap_size));
}
}
}
@@ -1188,8 +1207,10 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT
jl_free_aligned(d);
else
free(d);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, jl_array_nbytes(a));
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -jl_array_nbytes(a));
+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed,
+ jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + jl_array_nbytes(a));
+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size,
+ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_array_nbytes(a));
gc_num.freed += jl_array_nbytes(a);
gc_num.freecall++;
}
@@ -3589,8 +3610,14 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
jl_atomic_store_relaxed(&ptls->gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}
return malloc(sz);
}
@@ -3606,8 +3633,14 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz);
jl_atomic_store_relaxed(&ptls->gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, nm*sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, nm*sz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz*nm < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz*nm);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz*nm);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz*nm);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}
return calloc(nm, sz);
}
@@ -3619,12 +3652,15 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
free(p);
if (pgcstack != NULL && ct->world_age) {
jl_ptls_t ptls = ct->ptls;
- jl_atomic_store_relaxed(&ptls->gc_num.freed,
- jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz);
- jl_atomic_store_relaxed(&ptls->gc_num.freecall,
- jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -sz);
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ if (free_thresh + sz < 128*1024) {
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + sz);
+ }
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + sz));
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
+ }
}
}

@@ -3635,17 +3671,28 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
if (pgcstack != NULL && ct->world_age) {
jl_ptls_t ptls = ct->ptls;
maybe_collect(ptls);
- if (sz < old)
- jl_atomic_store_relaxed(&ptls->gc_num.freed,
- jl_atomic_load_relaxed(&ptls->gc_num.freed) + (old - sz));
- else
+ if (!(sz < old))
jl_atomic_store_relaxed(&ptls->gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old));
jl_atomic_store_relaxed(&ptls->gc_num.realloc,
jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, old);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old);
+
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ if (free_thresh + old < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + old);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + old);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + old));
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
+ }
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}
return realloc(p, sz);
}
@@ -3720,8 +3767,14 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz);
jl_atomic_store_relaxed(&ptls->gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + allocsz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + allocsz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + allocsz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + allocsz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
int last_errno = errno;
#ifdef _OS_WINDOWS_
DWORD last_error = GetLastError();
@@ -3752,17 +3805,28 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz;
inc_live_bytes(allocsz - oldsz);
}
- else if (allocsz < oldsz)
- jl_atomic_store_relaxed(&ptls->gc_num.freed,
- jl_atomic_load_relaxed(&ptls->gc_num.freed) + (oldsz - allocsz));
- else
+ else if (!(allocsz < oldsz))
jl_atomic_store_relaxed(&ptls->gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz));
jl_atomic_store_relaxed(&ptls->gc_num.realloc,
jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, oldsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz);
+
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ if (free_thresh + oldsz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + oldsz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + oldsz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + oldsz));
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
+ }
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + allocsz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + allocsz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + allocsz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + allocsz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
int last_errno = errno;
#ifdef _OS_WINDOWS_
DWORD last_error = GetLastError();
diff --git a/src/julia_threads.h b/src/julia_threads.h
index f4c235243e..a672a92fb9 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -130,12 +130,12 @@ typedef struct {

typedef struct {
_Atomic(int64_t) allocd;
- _Atomic(int64_t) freed;
_Atomic(uint64_t) malloc;
_Atomic(uint64_t) realloc;
_Atomic(uint64_t) poolalloc;
_Atomic(uint64_t) bigalloc;
- _Atomic(uint64_t) freecall;
+ _Atomic(int64_t) free_thresh; // fields used to batch fetch-add operations for the GC
+ _Atomic(uint64_t) alloc_thresh;
} jl_thread_gc_num_t;

typedef struct {
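
The `free_thresh`/`alloc_thresh` fields replace the per-thread `freed` and `freecall` counters and exist to batch updates to the global heap statistics. Condensed from the `gc.c` hunks above, the pattern is roughly the following (a sketch with a hypothetical helper name, not the exact code):

```c
#define FLUSH_THRESHOLD (128 * 1024) // bytes batched per thread before touching globals

// Bytes freed on this thread accumulate in free_thresh and are flushed to the
// process-wide counters with a single fetch-add once the batch crosses
// FLUSH_THRESHOLD, avoiding an atomic RMW on every individual free.
STATIC_INLINE void gc_count_freed_batched(jl_ptls_t ptls, size_t sz)
{
    uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
    if (free_thresh + sz < FLUSH_THRESHOLD) {
        jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + sz);
    }
    else {
        jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + sz);
        jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + sz));
        jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
    }
}
```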
12 changes: 9 additions & 3 deletions doc/src/devdocs/gc.md
@@ -67,6 +67,12 @@ This scheme eliminates the need of explicitly keeping a flag to indicate a full
## Heuristics

GC heuristics tune the GC by changing the size of the allocation interval between garbage collections.
If a GC was unproductive, we increase the allocation interval to give objects more time to die.
If a GC frees a lot of space, we can shrink the interval. The goal is to find a steady state where we
are allocating roughly as much as we are collecting.

The GC heuristics measure how big the heap is after a collection and schedule the next collection
according to the algorithm described in https://dl.acm.org/doi/10.1145/3563323. In summary, it argues
that the heap target should have a square-root relationship with the live heap, and that it should also
be scaled by how fast the GC is freeing objects and how fast the mutators are allocating.
The heuristics measure the heap size by counting the pages that are in use together with the objects
allocated through malloc. Previously we measured the heap size by counting live objects, but that does
not account for fragmentation, which could lead to bad decisions; it also meant that we used thread-local
information (allocations) to make a process-wide decision (when to GC). Measuring pages makes the decision global.
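
As a rough illustration of that square-root relationship (the function name, parameters, and tuning constant below are made up for the sketch and are not the exact code in `gc.c`):

```c
#include <math.h>
#include <stdint.h>

// Illustrative MemBalancer-style target: the headroom above the live heap
// grows with the square root of the live size, scaled up when the mutators
// allocate quickly and down when the collector frees quickly.
static uint64_t heap_target(uint64_t live_bytes,
                            double alloc_rate,    // mutator bytes/second
                            double gc_free_rate)  // collector bytes/second
{
    const double tuning = 2.0; // illustrative constant
    double extra = sqrt((double)live_bytes * alloc_rate / (gc_free_rate * tuning));
    return live_bytes + (uint64_t)extra;
}
```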

The GC will do full collections when the heap size reaches 80% of the maximum allowed size.
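
Expressed as code, that trigger is roughly the following (a sketch with hypothetical names, not the exact check in `gc.c`):

```c
#include <stdint.h>

// Request a full collection once the measured heap size crosses 80% of the
// maximum allowed heap size.
static int wants_full_collection(uint64_t heap_size, uint64_t max_total_memory)
{
    return heap_size >= max_total_memory / 5 * 4;
}
```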
17 changes: 15 additions & 2 deletions src/gc-debug.c
@@ -1,7 +1,10 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#include "gc.h"
#include "julia.h"
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// re-include assert.h without NDEBUG,
@@ -1216,15 +1219,25 @@ JL_DLLEXPORT void jl_enable_gc_logging(int enable) {
gc_logging_enabled = enable;
}

void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT {
void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT {
if (!gc_logging_enabled) {
return;
}
jl_safe_printf("GC: pause %.2fms. collected %fMB. %s %s\n",
pause/1e6, freed/1e6,
pause/1e6, freed/(double)(1<<20),
full ? "full" : "incr",
recollect ? "recollect" : ""
);

jl_safe_printf("Heap stats: bytes_mapped %.2f MB, bytes_resident %.2f MB, heap_size %.2f MB, heap_target %.2f MB, live_bytes %.2f MB\n, Fragmentation %.3f",
jl_atomic_load_relaxed(&gc_heap_stats.bytes_mapped)/(double)(1<<20),
jl_atomic_load_relaxed(&gc_heap_stats.bytes_resident)/(double)(1<<20),
jl_atomic_load_relaxed(&gc_heap_stats.heap_size)/(double)(1<<20),
jl_atomic_load_relaxed(&gc_heap_stats.heap_target)/(double)(1<<20),
live_bytes/(double)(1<<20),
(double)live_bytes/(double)jl_atomic_load_relaxed(&gc_heap_stats.heap_size)
);
// Should fragmentation use bytes_resident instead of heap_size?
}
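
For a sense of scale (made-up numbers): with live_bytes = 800 MiB and heap_size = 1024 MiB, the final field prints 0.781; computed against a hypothetical bytes_resident of 900 MiB it would print 0.889, which is the trade-off the comment above is asking about.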

#ifdef __cplusplus
4 changes: 4 additions & 0 deletions src/gc-pages.c
@@ -52,6 +52,8 @@ char *jl_gc_try_alloc_pages_(int pg_cnt) JL_NOTSAFEPOINT
// round data pointer up to the nearest gc_page_data-aligned
// boundary if mmap didn't already do so.
mem = (char*)gc_page_data(mem + GC_PAGE_SZ - 1);
jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mapped, pages_sz);
jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, pages_sz);
return mem;
}

@@ -115,6 +117,7 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT
// try to get page from `pool_freed`
meta = pop_lf_page_metadata_back(&global_page_pool_freed);
if (meta != NULL) {
jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, GC_PAGE_SZ);
gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED);
goto exit;
}
@@ -188,6 +191,7 @@ void jl_gc_free_page(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT
madvise(p, decommit_size, MADV_DONTNEED);
#endif
msan_unpoison(p, decommit_size);
jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, -decommit_size);
}

#ifdef __cplusplus
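
Taken together, the gc-pages.c hunks maintain two page-level counters: `bytes_mapped` only grows as pages are mmapped, while `bytes_resident` tracks pages currently backed by physical memory. A standalone model of that bookkeeping (hypothetical helper names, not the runtime's API):

```c
#include <stdint.h>

// Mapping fresh pages grows both counters; decommitting a page with
// madvise(MADV_DONTNEED) shrinks only the resident count (the address range
// stays mapped); reusing a previously decommitted page makes it resident again.
static uint64_t bytes_mapped;
static uint64_t bytes_resident;

static void pages_mapped(uint64_t sz)     { bytes_mapped += sz; bytes_resident += sz; }
static void page_decommitted(uint64_t sz) { bytes_resident -= sz; }
static void page_reused(uint64_t sz)      { bytes_resident += sz; }
```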