
Page based heap size heuristics #50144

Merged · 13 commits · Jul 23, 2023
1 change: 1 addition & 0 deletions NEWS.md
@@ -9,6 +9,7 @@ Language changes

Compiler/Runtime improvements
-----------------------------
* Updated GC heuristics to count allocated pages instead of individual objects ([#50144]).

Command-line option changes
---------------------------
255 changes: 255 additions & 0 deletions batch.diff
@@ -0,0 +1,255 @@
diff --git a/src/gc.c b/src/gc.c
Member comment: Remove?

index c85d1e5455..c82b2b645d 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -6,6 +6,8 @@
#include "julia_gcext.h"
#include "julia_assert.h"
#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
#include <sys/types.h>
#ifdef __GLIBC__
#include <malloc.h> // for malloc_trim
@@ -1004,8 +1006,14 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz);
jl_atomic_store_relaxed(&ptls->gc_num.bigalloc,
jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + allocsz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + allocsz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + allocsz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + allocsz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
#ifdef MEMDEBUG
memset(v, 0xee, allocsz);
#endif
@@ -1051,8 +1059,10 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT
if (nxt)
nxt->prev = pv;
gc_num.freed += v->sz&~3;
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, v->sz&~3);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(v->sz&~3));
+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed,
+ jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + (v->sz&~3));
+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size,
+ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - (v->sz&~3));
#ifdef MEMDEBUG
memset(v, 0xbb, v->sz&~3);
#endif
@@ -1112,8 +1122,14 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT
jl_ptls_t ptls = jl_current_task->ptls;
jl_atomic_store_relaxed(&ptls->gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}

static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
@@ -1126,12 +1142,15 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
jl_ptls_t ptls = gc_all_tls_states[i];
if (ptls) {
dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval);
- dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed);
dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc);
dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc);
dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc);
dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
- dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ jl_atomic_store_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd));
+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed));
+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_thresh - free_thresh + jl_atomic_load_relaxed(&gc_heap_stats.heap_size));
}
}
}
@@ -1188,8 +1207,10 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT
jl_free_aligned(d);
else
free(d);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, jl_array_nbytes(a));
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -jl_array_nbytes(a));
+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed,
+ jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + jl_array_nbytes(a));
+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size,
+ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_array_nbytes(a));
gc_num.freed += jl_array_nbytes(a);
gc_num.freecall++;
}
@@ -3589,8 +3610,14 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
jl_atomic_store_relaxed(&ptls->gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}
return malloc(sz);
}
@@ -3606,8 +3633,14 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz);
jl_atomic_store_relaxed(&ptls->gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, nm*sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, nm*sz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz*nm < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz*nm);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz*nm);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz*nm);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}
return calloc(nm, sz);
}
@@ -3619,12 +3652,15 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
free(p);
if (pgcstack != NULL && ct->world_age) {
jl_ptls_t ptls = ct->ptls;
- jl_atomic_store_relaxed(&ptls->gc_num.freed,
- jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz);
- jl_atomic_store_relaxed(&ptls->gc_num.freecall,
- jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -sz);
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ if (free_thresh + sz < 128*1024) {
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + sz);
+ }
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + sz));
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
+ }
}
}

@@ -3635,17 +3671,28 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
if (pgcstack != NULL && ct->world_age) {
jl_ptls_t ptls = ct->ptls;
maybe_collect(ptls);
- if (sz < old)
- jl_atomic_store_relaxed(&ptls->gc_num.freed,
- jl_atomic_load_relaxed(&ptls->gc_num.freed) + (old - sz));
- else
+ if (!(sz < old))
jl_atomic_store_relaxed(&ptls->gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old));
jl_atomic_store_relaxed(&ptls->gc_num.realloc,
jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, old);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old);
+
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ if (free_thresh + old < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + old);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + old);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + old));
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
+ }
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}
return realloc(p, sz);
}
@@ -3720,8 +3767,14 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz);
jl_atomic_store_relaxed(&ptls->gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + allocsz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + allocsz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + allocsz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + allocsz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
int last_errno = errno;
#ifdef _OS_WINDOWS_
DWORD last_error = GetLastError();
@@ -3752,17 +3805,28 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz;
inc_live_bytes(allocsz - oldsz);
}
- else if (allocsz < oldsz)
- jl_atomic_store_relaxed(&ptls->gc_num.freed,
- jl_atomic_load_relaxed(&ptls->gc_num.freed) + (oldsz - allocsz));
- else
+ else if (!(allocsz < oldsz))
jl_atomic_store_relaxed(&ptls->gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz));
jl_atomic_store_relaxed(&ptls->gc_num.realloc,
jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, oldsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz);
+
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ if (free_thresh + oldsz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + oldsz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + oldsz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + oldsz));
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
+ }
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + allocsz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + allocsz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + allocsz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + allocsz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
int last_errno = errno;
#ifdef _OS_WINDOWS_
DWORD last_error = GetLastError();
diff --git a/src/julia_threads.h b/src/julia_threads.h
index f4c235243e..a672a92fb9 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -130,12 +130,12 @@ typedef struct {

typedef struct {
_Atomic(int64_t) allocd;
- _Atomic(int64_t) freed;
_Atomic(uint64_t) malloc;
_Atomic(uint64_t) realloc;
_Atomic(uint64_t) poolalloc;
_Atomic(uint64_t) bigalloc;
- _Atomic(uint64_t) freecall;
+ _Atomic(int64_t) free_thresh; // fields used to batch fetch-add operations for the GC
+ _Atomic(uint64_t) alloc_thresh;
} jl_thread_gc_num_t;

typedef struct {
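
The `free_thresh`/`alloc_thresh` fields replace the per-thread `freed` and `freecall` counters and exist to batch updates to the global heap statistics. Condensed from the `gc.c` hunks above, the pattern is roughly the following (a sketch with a hypothetical helper name, not the exact code):

```c
#define FLUSH_THRESHOLD (128 * 1024) // bytes batched per thread before touching globals

// Bytes freed on this thread accumulate in free_thresh and are flushed to the
// process-wide counters with a single fetch-add once the batch crosses
// FLUSH_THRESHOLD, avoiding an atomic RMW on every individual free.
STATIC_INLINE void gc_count_freed_batched(jl_ptls_t ptls, size_t sz)
{
    uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
    if (free_thresh + sz < FLUSH_THRESHOLD) {
        jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + sz);
    }
    else {
        jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + sz);
        jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + sz));
        jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
    }
}
```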
12 changes: 9 additions & 3 deletions doc/src/devdocs/gc.md
@@ -67,6 +67,12 @@ This scheme eliminates the need of explicitly keeping a flag to indicate a full
## Heuristics

GC heuristics tune the GC by changing the size of the allocation interval between garbage collections.
If a GC was unproductive, we increase the allocation interval to give objects more time to die.
If a GC frees a lot of space, we can shrink the interval. The goal is to find a steady state where we
are allocating roughly as much as we are collecting.

The GC heuristics measure how big the heap is after a collection and schedule the next collection
according to the algorithm described in https://dl.acm.org/doi/10.1145/3563323. In summary, it argues
that the heap target should have a square-root relationship with the live heap, and that it should also
be scaled by how fast the GC is freeing objects and how fast the mutators are allocating.
The heuristics measure the heap size by counting the pages that are in use together with the objects
allocated through malloc. Previously we measured the heap size by counting live objects, but that does
not account for fragmentation, which could lead to bad decisions; it also meant that we used thread-local
information (allocations) to make a process-wide decision (when to GC). Measuring pages makes the decision global.
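
As a rough illustration of that square-root relationship (the function name, parameters, and tuning constant below are made up for the sketch and are not the exact code in `gc.c`):

```c
#include <math.h>
#include <stdint.h>

// Illustrative MemBalancer-style target: the headroom above the live heap
// grows with the square root of the live size, scaled up when the mutators
// allocate quickly and down when the collector frees quickly.
static uint64_t heap_target(uint64_t live_bytes,
                            double alloc_rate,    // mutator bytes/second
                            double gc_free_rate)  // collector bytes/second
{
    const double tuning = 2.0; // illustrative constant
    double extra = sqrt((double)live_bytes * alloc_rate / (gc_free_rate * tuning));
    return live_bytes + (uint64_t)extra;
}
```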

The GC will do full collections when the heap size reaches 80% of the maximum allowed size.
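
Expressed as code, that trigger is roughly the following (a sketch with hypothetical names, not the exact check in `gc.c`):

```c
#include <stdint.h>

// Request a full collection once the measured heap size crosses 80% of the
// maximum allowed heap size.
static int wants_full_collection(uint64_t heap_size, uint64_t max_total_memory)
{
    return heap_size >= max_total_memory / 5 * 4;
}
```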
17 changes: 15 additions & 2 deletions src/gc-debug.c
@@ -1,7 +1,10 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#include "gc.h"
#include "julia.h"
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// re-include assert.h without NDEBUG,
@@ -1216,15 +1219,25 @@ JL_DLLEXPORT void jl_enable_gc_logging(int enable) {
gc_logging_enabled = enable;
}

void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT {
void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT {
if (!gc_logging_enabled) {
return;
}
jl_safe_printf("GC: pause %.2fms. collected %fMB. %s %s\n",
pause/1e6, freed/1e6,
pause/1e6, freed/(double)(1<<20),
full ? "full" : "incr",
recollect ? "recollect" : ""
);

jl_safe_printf("Heap stats: bytes_mapped %.2f MB, bytes_resident %.2f MB, heap_size %.2f MB, heap_target %.2f MB, live_bytes %.2f MB\n, Fragmentation %.3f",
jl_atomic_load_relaxed(&gc_heap_stats.bytes_mapped)/(double)(1<<20),
jl_atomic_load_relaxed(&gc_heap_stats.bytes_resident)/(double)(1<<20),
jl_atomic_load_relaxed(&gc_heap_stats.heap_size)/(double)(1<<20),
jl_atomic_load_relaxed(&gc_heap_stats.heap_target)/(double)(1<<20),
live_bytes/(double)(1<<20),
(double)live_bytes/(double)jl_atomic_load_relaxed(&gc_heap_stats.heap_size)
);
// Should fragmentation use bytes_resident instead of heap_size?
}
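
For a sense of scale (made-up numbers): with live_bytes = 800 MiB and heap_size = 1024 MiB, the final field prints 0.781; computed against a hypothetical bytes_resident of 900 MiB it would print 0.889, which is the trade-off the comment above is asking about.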

#ifdef __cplusplus
4 changes: 4 additions & 0 deletions src/gc-pages.c
@@ -52,6 +52,8 @@ char *jl_gc_try_alloc_pages_(int pg_cnt) JL_NOTSAFEPOINT
// round data pointer up to the nearest gc_page_data-aligned
// boundary if mmap didn't already do so.
mem = (char*)gc_page_data(mem + GC_PAGE_SZ - 1);
jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mapped, pages_sz);
jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, pages_sz);
return mem;
}

@@ -115,6 +117,7 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT
// try to get page from `pool_freed`
meta = pop_lf_page_metadata_back(&global_page_pool_freed);
if (meta != NULL) {
jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, GC_PAGE_SZ);
gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED);
goto exit;
}
@@ -188,6 +191,7 @@ void jl_gc_free_page(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT
madvise(p, decommit_size, MADV_DONTNEED);
#endif
msan_unpoison(p, decommit_size);
jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, -decommit_size);
}

#ifdef __cplusplus
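
Taken together, the gc-pages.c hunks maintain two page-level counters: `bytes_mapped` only grows as pages are mmapped, while `bytes_resident` tracks pages currently backed by physical memory. A standalone model of that bookkeeping (hypothetical helper names, not the runtime's API):

```c
#include <stdint.h>

// Mapping fresh pages grows both counters; decommitting a page with
// madvise(MADV_DONTNEED) shrinks only the resident count (the address range
// stays mapped); reusing a previously decommitted page makes it resident again.
static uint64_t bytes_mapped;
static uint64_t bytes_resident;

static void pages_mapped(uint64_t sz)     { bytes_mapped += sz; bytes_resident += sz; }
static void page_decommitted(uint64_t sz) { bytes_resident -= sz; }
static void page_reused(uint64_t sz)      { bytes_resident += sz; }
```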