From 4537c8b113b00726e63c06f44ec9cc22c1363905 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Sun, 21 Jan 2024 16:57:20 -0300 Subject: [PATCH] reduce contention on page metadata lists during the sweeping phase (#52943) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **EDIT**: fixes https://github.com/JuliaLang/julia/issues/52937 by decreasing the contention on the page lists and only waking GC threads up if we have a sufficiently large number of pages. Seems to address the regression from the MWE of https://github.com/JuliaLang/julia/issues/52937: - master: ``` ../julia-master/julia --project=. run_benchmarks.jl serial obj_arrays issue-52937 -n5 --gcthreads=1 bench = "issue-52937.jl" ┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐ │ │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │ │ │ ms │ ms │ ms │ ms │ ms │ us │ MB │ % │ ├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤ │ minimum │ 24841 │ 818 │ 78 │ 740 │ 44 │ 10088 │ 96 │ 3 │ │ median │ 24881 │ 834 │ 83 │ 751 │ 45 │ 10738 │ 97 │ 3 │ │ maximum │ 25002 │ 891 │ 87 │ 803 │ 48 │ 11074 │ 112 │ 4 │ │ stdev │ 78 │ 29 │ 4 │ 26 │ 1 │ 393 │ 7 │ 0 │ └─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘ ../julia-master/julia --project=. run_benchmarks.jl serial obj_arrays issue-52937 -n5 --gcthreads=8 bench = "issue-52937.jl" ┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐ │ │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │ │ │ ms │ ms │ ms │ ms │ ms │ us │ MB │ % │ ├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤ │ minimum │ 29113 │ 5200 │ 68 │ 5130 │ 12 │ 9724 │ 95 │ 18 │ │ median │ 29354 │ 5274 │ 69 │ 5204 │ 12 │ 10456 │ 96 │ 18 │ │ maximum │ 29472 │ 5333 │ 70 │ 5264 │ 14 │ 11913 │ 97 │ 18 │ │ stdev │ 138 │ 54 │ 1 │ 55 │ 1 │ 937 │ 1 │ 0 │ └─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘ ``` - PR: ``` ../julia-master/julia --project=. run_benchmarks.jl serial obj_arrays issue-52937 -n5 --gcthreads=1 bench = "issue-52937.jl" ┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐ │ │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │ │ │ ms │ ms │ ms │ ms │ ms │ us │ MB │ % │ ├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤ │ minimum │ 24475 │ 761 │ 77 │ 681 │ 40 │ 9499 │ 94 │ 3 │ │ median │ 24845 │ 775 │ 80 │ 698 │ 43 │ 10793 │ 97 │ 3 │ │ maximum │ 25128 │ 811 │ 85 │ 726 │ 47 │ 12820 │ 113 │ 3 │ │ stdev │ 240 │ 22 │ 3 │ 21 │ 3 │ 1236 │ 8 │ 0 │ └─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘ ../julia-master/julia --project=. run_benchmarks.jl serial obj_arrays issue-52937 -n5 --gcthreads=8 bench = "issue-52937.jl" ┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐ │ │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │ │ │ ms │ ms │ ms │ ms │ ms │ us │ MB │ % │ ├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤ │ minimum │ 24709 │ 679 │ 70 │ 609 │ 11 │ 9981 │ 95 │ 3 │ │ median │ 24869 │ 702 │ 70 │ 631 │ 12 │ 10705 │ 96 │ 3 │ │ maximum │ 24911 │ 708 │ 72 │ 638 │ 13 │ 10820 │ 98 │ 3 │ │ stdev │ 79 │ 12 │ 1 │ 12 │ 1 │ 401 │ 1 │ 0 │ └─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘ ``` Also, performance on `objarray.jl` (an example of benchmark in which sweeping parallelizes well with the current implementation) seems fine: - master: ``` ../julia-master/julia --project=. run_benchmarks.jl multithreaded bigarrays -n5 --gcthreads=1 bench = "objarray.jl" ┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐ │ │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │ │ │ ms │ ms │ ms │ ms │ ms │ us │ MB │ % │ ├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤ │ minimum │ 19301 │ 10792 │ 7485 │ 3307 │ 1651 │ 196 │ 4519 │ 56 │ │ median │ 21415 │ 12646 │ 9094 │ 3551 │ 1985 │ 241 │ 6576 │ 59 │ │ maximum │ 21873 │ 13118 │ 9353 │ 3765 │ 2781 │ 330 │ 8793 │ 60 │ │ stdev │ 1009 │ 932 │ 757 │ 190 │ 449 │ 50 │ 1537 │ 2 │ └─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘ ../julia-master/julia --project=. run_benchmarks.jl multithreaded bigarrays -n5 --gcthreads=8 bench = "objarray.jl" ┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐ │ │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │ │ │ ms │ ms │ ms │ ms │ ms │ us │ MB │ % │ ├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤ │ minimum │ 13135 │ 4377 │ 3350 │ 1007 │ 491 │ 231 │ 6062 │ 33 │ │ median │ 13164 │ 4540 │ 3370 │ 1177 │ 669 │ 256 │ 6383 │ 35 │ │ maximum │ 13525 │ 4859 │ 3675 │ 1184 │ 748 │ 320 │ 7528 │ 36 │ │ stdev │ 183 │ 189 │ 146 │ 77 │ 129 │ 42 │ 584 │ 1 │ └─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘ ``` - PR: ``` ../julia-master/julia --project=. run_benchmarks.jl multithreaded bigarrays -n5 --gcthreads=1 bench = "objarray.jl" ┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐ │ │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │ │ │ ms │ ms │ ms │ ms │ ms │ us │ MB │ % │ ├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤ │ minimum │ 19642 │ 10931 │ 7566 │ 3365 │ 1653 │ 204 │ 5688 │ 56 │ │ median │ 21441 │ 12717 │ 8948 │ 3770 │ 1796 │ 217 │ 6972 │ 59 │ │ maximum │ 23494 │ 14643 │ 10576 │ 4067 │ 2513 │ 248 │ 8229 │ 62 │ │ stdev │ 1408 │ 1339 │ 1079 │ 267 │ 393 │ 19 │ 965 │ 2 │ └─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘ ../julia-master/julia --project=. run_benchmarks.jl multithreaded bigarrays -n5 --gcthreads=8 bench = "objarray.jl" ┌─────────┬────────────┬─────────┬───────────┬────────────┬──────────────┬───────────────────┬──────────┬────────────┐ │ │ total time │ gc time │ mark time │ sweep time │ max GC pause │ time to safepoint │ max heap │ percent gc │ │ │ ms │ ms │ ms │ ms │ ms │ us │ MB │ % │ ├─────────┼────────────┼─────────┼───────────┼────────────┼──────────────┼───────────────────┼──────────┼────────────┤ │ minimum │ 13365 │ 4544 │ 3389 │ 1104 │ 516 │ 255 │ 6349 │ 34 │ │ median │ 13445 │ 4624 │ 3404 │ 1233 │ 578 │ 275 │ 6385 │ 34 │ │ maximum │ 14413 │ 5278 │ 3837 │ 1441 │ 753 │ 300 │ 7547 │ 37 │ │ stdev │ 442 │ 303 │ 194 │ 121 │ 89 │ 18 │ 522 │ 1 │ └─────────┴────────────┴─────────┴───────────┴────────────┴──────────────┴───────────────────┴──────────┴────────────┘ ``` --- src/gc.c | 124 +++++++++++++++++++++++++++++++++++++++++++++------- src/gc.h | 46 ++++++++++++++++++- src/partr.c | 2 +- 3 files changed, 154 insertions(+), 18 deletions(-) diff --git a/src/gc.c b/src/gc.c index 693243a415756..562d60905806b 100644 --- a/src/gc.c +++ b/src/gc.c @@ -21,8 +21,8 @@ int jl_n_sweepthreads; _Atomic(int) gc_n_threads_marking; // Number of threads sweeping _Atomic(int) gc_n_threads_sweeping; -// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping -_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch; +// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing) +_Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch; // `tid` of mutator thread that triggered GC _Atomic(int) gc_master_tid; // `tid` of first GC thread @@ -1586,8 +1586,72 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_ pg->nfree = nfree; } -void gc_sweep_wake_all(void) +// pre-scan pages to check whether there are enough pages so that's worth parallelizing +// also sweeps pages that don't need to be linearly scanned +int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch) { + // 4MB worth of pages is worth parallelizing + const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ); + int n_pages_to_scan = 0; + gc_page_profiler_serializer_t serializer = gc_page_serializer_create(); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) { + continue; + } + jl_gc_page_stack_t *dest = &new_gc_allocd_scratch[ptls2->tid].stack; + jl_gc_page_stack_t tmp; + jl_gc_pagemeta_t *tail = NULL; + memset(&tmp, 0, sizeof(tmp)); + while (1) { + jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->page_metadata_allocd); + if (pg == NULL) { + break; + } + int should_scan = 1; + if (!pg->has_marked) { + should_scan = 0; + } + if (!current_sweep_full && !pg->has_young) { + assert(!prev_sweep_full || pg->prev_nold >= pg->nold); + if (!prev_sweep_full || pg->prev_nold == pg->nold) { + should_scan = 0; + } + } + if (should_scan) { + if (tail == NULL) { + tail = pg; + } + n_pages_to_scan++; + push_lf_back_nosync(&tmp, pg); + } + else { + gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg); + } + if (n_pages_to_scan >= n_pages_worth_parallel_sweep) { + break; + } + } + if (tail != NULL) { + tail->next = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + } + ptls2->page_metadata_allocd = tmp; + if (n_pages_to_scan >= n_pages_worth_parallel_sweep) { + break; + } + } + gc_page_serializer_destroy(&serializer); + return n_pages_to_scan >= n_pages_worth_parallel_sweep; +} + +// wake up all threads to sweep the pages +void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch) +{ + int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch); + jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch); + if (!parallel_sweep_worthwhile) { + return; + } uv_mutex_lock(&gc_threads_lock); for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; @@ -1597,6 +1661,7 @@ void gc_sweep_wake_all(void) uv_mutex_unlock(&gc_threads_lock); } +// wait for all threads to finish sweeping void gc_sweep_wait_for_all(void) { jl_atomic_store(&gc_allocd_scratch, NULL); @@ -1605,36 +1670,58 @@ void gc_sweep_wait_for_all(void) } } -void gc_sweep_pool_parallel(void) +// sweep all pools +void gc_sweep_pool_parallel(jl_ptls_t ptls) { jl_atomic_fetch_add(&gc_n_threads_sweeping, 1); - jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch); + jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch); if (allocd_scratch != NULL) { gc_page_profiler_serializer_t serializer = gc_page_serializer_create(); while (1) { int found_pg = 0; + // sequentially walk the threads and sweep the pages for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + // skip foreign threads that already exited if (ptls2 == NULL) { continue; } - jl_gc_page_stack_t *allocd = &allocd_scratch[t_i]; - jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd); + jl_gc_page_stack_t *dest = &allocd_scratch[ptls2->tid].stack; + jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd); + // failed steal attempt if (pg == NULL) { continue; } - gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg); + gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg); found_pg = 1; } if (!found_pg) { - break; + // check for termination + int no_more_work = 1; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + // skip foreign threads that already exited + if (ptls2 == NULL) { + continue; + } + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + if (pg != NULL) { + no_more_work = 0; + break; + } + } + if (no_more_work) { + break; + } } + jl_cpu_pause(); } gc_page_serializer_destroy(&serializer); } jl_atomic_fetch_add(&gc_n_threads_sweeping, -1); } +// free all pages (i.e. through `madvise` on Linux) that were lazily freed void gc_free_pages(void) { while (1) { @@ -1659,7 +1746,7 @@ static void gc_sweep_pool(void) // allocate enough space to hold the end of the free list chain // for every thread and pool size - jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**)); + jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**)); // update metadata of pages that were pointed to by freelist or newpages from a pool // i.e. pages being the current allocation target @@ -1701,17 +1788,18 @@ static void gc_sweep_pool(void) } // the actual sweeping - jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t)); - memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t)); - jl_atomic_store(&gc_allocd_scratch, tmp); - gc_sweep_wake_all(); - gc_sweep_pool_parallel(); + jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) malloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t)); + memset(new_gc_allocd_scratch, 0, n_threads * sizeof(jl_gc_padded_page_stack_t)); + jl_ptls_t ptls = jl_current_task->ptls; + gc_sweep_wake_all(ptls, new_gc_allocd_scratch); + gc_sweep_pool_parallel(ptls); gc_sweep_wait_for_all(); + // reset half-pages pointers for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - ptls2->page_metadata_allocd = tmp[t_i]; + ptls2->page_metadata_allocd = new_gc_allocd_scratch[t_i].stack; for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; p->newpages = NULL; @@ -1749,6 +1837,10 @@ static void gc_sweep_pool(void) } } + // cleanup + free(pfl); + free(new_gc_allocd_scratch); + #ifdef _P64 // only enable concurrent sweeping on 64bit // wake thread up to sweep concurrently if (jl_n_sweepthreads > 0) { diff --git a/src/gc.h b/src/gc.h index 1d63c1d96e397..5357dcbc60d00 100644 --- a/src/gc.h +++ b/src/gc.h @@ -195,6 +195,23 @@ extern jl_gc_page_stack_t global_page_pool_freed; // in the sweeping phase, which also doesn't push a node into the // same stack after it's popped +STATIC_INLINE void push_lf_back_nosync(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT +{ + jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom); + elt->next = old_back; + jl_atomic_store_relaxed(&pool->bottom, elt); +} + +STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back_nosync(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT +{ + jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom); + if (old_back == NULL) { + return NULL; + } + jl_atomic_store_relaxed(&pool->bottom, old_back->next); + return old_back; +} + STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT { while (1) { @@ -207,6 +224,23 @@ STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) } } +#define MAX_POP_ATTEMPTS (1 << 10) + +STATIC_INLINE jl_gc_pagemeta_t *try_pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT +{ + for (int i = 0; i < MAX_POP_ATTEMPTS; i++) { + jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom); + if (old_back == NULL) { + return NULL; + } + if (jl_atomic_cmpswap(&pool->bottom, &old_back, old_back->next)) { + return old_back; + } + jl_cpu_pause(); + } + return NULL; +} + STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT { while (1) { @@ -220,6 +254,16 @@ STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFE jl_cpu_pause(); } } +typedef struct { + jl_gc_page_stack_t stack; + // pad to 128 bytes to avoid false-sharing +#ifdef _P64 + void *_pad[15]; +#else + void *_pad[31]; +#endif +} jl_gc_padded_page_stack_t; +static_assert(sizeof(jl_gc_padded_page_stack_t) == 128, "jl_gc_padded_page_stack_t is not 128 bytes"); typedef struct { _Atomic(size_t) n_freed_objs; @@ -461,7 +505,7 @@ void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_ void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq); void gc_mark_loop_serial(jl_ptls_t ptls); void gc_mark_loop_parallel(jl_ptls_t ptls, int master); -void gc_sweep_pool_parallel(void); +void gc_sweep_pool_parallel(jl_ptls_t ptls); void gc_free_pages(void); void sweep_stack_pools(void); void jl_gc_debug_init(void); diff --git a/src/partr.c b/src/partr.c index 23a252b537f99..ffb8859f1b0ea 100644 --- a/src/partr.c +++ b/src/partr.c @@ -143,7 +143,7 @@ void jl_parallel_gc_threadfun(void *arg) gc_mark_loop_parallel(ptls, 0); } if (may_sweep(ptls)) { // not an else! - gc_sweep_pool_parallel(); + gc_sweep_pool_parallel(ptls); jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1); } }