From 42caea220f675cdafa83557147e579e07d998b6b Mon Sep 17 00:00:00 2001
From: Jameson Nash
Date: Wed, 22 Jun 2022 15:40:14 -0400
Subject: [PATCH] Reland "gc: avoid cpu stalls when starting"

This reverts commit 4801b6ce9a4d737ad1722ca620a7bc8087590f2e and adds
the safepoints needed to catch the unsafe->safe transition also.
---
 src/gc.c            | 17 +----------------
 src/julia_threads.h |  6 +++---
 src/rtutils.c       |  4 +---
 src/safepoint.c     | 36 ++++++++++++++++++++++++++++++++----
 src/signals-mach.c  |  3 ++-
 5 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/src/gc.c b/src/gc.c
index 4221cb8e83f15..74f3bb6d75a1d 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -190,22 +190,7 @@ NOINLINE uintptr_t gc_get_stack_ptr(void)
 
 #define should_timeout() 0
 
-static void jl_gc_wait_for_the_world(void)
-{
-    if (jl_n_threads > 1)
-        jl_wake_libuv();
-    for (int i = 0; i < jl_n_threads; i++) {
-        jl_ptls_t ptls2 = jl_all_tls_states[i];
-        // This acquire load pairs with the release stores
-        // in the signal handler of safepoint so we are sure that
-        // all the stores on those threads are visible.
-        // We're currently also using atomic store release in mutator threads
-        // (in jl_gc_state_set), but we may want to use signals to flush the
-        // memory operations on those threads lazily instead.
-        while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state))
-            jl_cpu_pause(); // yield?
-    }
-}
+void jl_gc_wait_for_the_world(void);
 
 // malloc wrappers, aligned allocation
 
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 6f1c4e50d4e95..2f962e3d28445 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -340,9 +340,9 @@ STATIC_INLINE int8_t jl_gc_state_set(jl_ptls_t ptls, int8_t state,
                                      int8_t old_state)
 {
     jl_atomic_store_release(&ptls->gc_state, state);
-    // A safe point is required if we transition from GC-safe region to
-    // non GC-safe region.
-    if (old_state && !state)
+    if (state == JL_GC_STATE_SAFE && old_state == 0)
+        jl_gc_safepoint_(ptls);
+    if (state == 0 && old_state == JL_GC_STATE_SAFE)
         jl_gc_safepoint_(ptls);
     return old_state;
 }
diff --git a/src/rtutils.c b/src/rtutils.c
index f3a2e745ed651..3ec61e14a40a9 100644
--- a/src/rtutils.c
+++ b/src/rtutils.c
@@ -276,9 +276,7 @@ JL_DLLEXPORT void jl_eh_restore_state(jl_handler_t *eh)
     ct->ptls->defer_signal = eh->defer_signal;
     if (old_gc_state != eh->gc_state) {
         jl_atomic_store_release(&ct->ptls->gc_state, eh->gc_state);
-        if (old_gc_state) {
-            jl_gc_safepoint_(ct->ptls);
-        }
+        jl_gc_safepoint_(ct->ptls);
     }
     if (old_defer_signal && !eh->defer_signal) {
         jl_sigint_safepoint(ct->ptls);
diff --git a/src/safepoint.c b/src/safepoint.c
index b2feccf74e068..2bb7b008353d3 100644
--- a/src/safepoint.c
+++ b/src/safepoint.c
@@ -43,7 +43,8 @@ uint8_t jl_safepoint_enable_cnt[3] = {0, 0, 0};
 // load/store so that threads waiting for the GC doesn't have to also
 // fight on the safepoint lock...
 uv_mutex_t safepoint_lock;
-uv_cond_t safepoint_cond;
+uv_cond_t safepoint_cond_begin;
+uv_cond_t safepoint_cond_end;
 
 static void jl_safepoint_enable(int idx) JL_NOTSAFEPOINT
 {
@@ -88,7 +89,8 @@ static void jl_safepoint_disable(int idx) JL_NOTSAFEPOINT
 void jl_safepoint_init(void)
 {
     uv_mutex_init(&safepoint_lock);
-    uv_cond_init(&safepoint_cond);
+    uv_cond_init(&safepoint_cond_begin);
+    uv_cond_init(&safepoint_cond_end);
     // jl_page_size isn't available yet.
     size_t pgsz = jl_getpagesize();
 #ifdef _OS_WINDOWS_
@@ -109,6 +111,31 @@ void jl_safepoint_init(void)
     jl_safepoint_pages = addr;
 }
 
+void jl_gc_wait_for_the_world(void)
+{
+    assert(jl_n_threads);
+    if (jl_n_threads > 1)
+        jl_wake_libuv();
+    for (int i = 0; i < jl_n_threads; i++) {
+        jl_ptls_t ptls2 = jl_all_tls_states[i];
+        // This acquire load pairs with the release stores
+        // in the signal handler of safepoint so we are sure that
+        // all the stores on those threads are visible.
+        // We're currently also using atomic store release in mutator threads
+        // (in jl_gc_state_set), but we may want to use signals to flush the
+        // memory operations on those threads lazily instead.
+        while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) {
+            // Use system mutexes rather than spin locking to minimize wasted CPU time
+            // while we wait for other threads reach a safepoint.
+            // This is particularly important when run under rr.
+            uv_mutex_lock(&safepoint_lock);
+            if (!jl_atomic_load_relaxed(&ptls2->gc_state))
+                uv_cond_wait(&safepoint_cond_begin, &safepoint_lock);
+            uv_mutex_unlock(&safepoint_lock);
+        }
+    }
+}
+
 int jl_safepoint_start_gc(void)
 {
     if (jl_n_threads == 1) {
@@ -153,13 +180,14 @@ void jl_safepoint_end_gc(void)
     jl_mach_gc_end();
 # endif
     uv_mutex_unlock(&safepoint_lock);
-    uv_cond_broadcast(&safepoint_cond);
+    uv_cond_broadcast(&safepoint_cond_end);
 }
 
 void jl_safepoint_wait_gc(void)
 {
     // The thread should have set this is already
     assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) != 0);
+    uv_cond_broadcast(&safepoint_cond_begin);
     // Use normal volatile load in the loop for speed until GC finishes.
     // Then use an acquire load to make sure the GC result is visible on this thread.
     while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) {
@@ -168,7 +196,7 @@ void jl_safepoint_wait_gc(void)
         // This is particularly important when run under rr.
         uv_mutex_lock(&safepoint_lock);
         if (jl_atomic_load_relaxed(&jl_gc_running))
-            uv_cond_wait(&safepoint_cond, &safepoint_lock);
+            uv_cond_wait(&safepoint_cond_end, &safepoint_lock);
         uv_mutex_unlock(&safepoint_lock);
     }
 }
diff --git a/src/signals-mach.c b/src/signals-mach.c
index ff1cc8f0a72f8..480f056bba8f0 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -42,7 +42,7 @@ static void attach_exception_port(thread_port_t thread, int segv_only);
 // low 16 bits are the thread id, the next 8 bits are the original gc_state
 static arraylist_t suspended_threads;
 extern uv_mutex_t safepoint_lock;
-extern uv_cond_t safepoint_cond;
+extern uv_cond_t safepoint_cond_begin;
 void jl_mach_gc_end(void)
 {
     // Requires the safepoint lock to be held
@@ -85,6 +85,7 @@ static int jl_mach_gc_wait(jl_ptls_t ptls2,
     arraylist_push(&suspended_threads, (void*)item);
     thread_suspend(thread);
     uv_mutex_unlock(&safepoint_lock);
+    uv_cond_broadcast(&safepoint_cond_begin);
     return 1;
 }
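
For readers unfamiliar with the pattern this patch switches to, here is a minimal standalone sketch of the same idea: a waiter double-checks an atomic flag under a libuv mutex before sleeping on a condition variable, and the other side publishes the flag and then broadcasts, so waiting threads sleep instead of burning CPU in a spin loop. Everything below (sketch.c, gc_ready, waiter, announcer, the build line) is an assumption made up for illustration and is not part of the Julia sources or of this patch.

/* sketch.c -- illustrative only, not taken from the Julia sources.
 * Build (assumption): cc sketch.c -luv -o sketch */
#include <stdatomic.h>
#include <stdio.h>
#include <uv.h>

static _Atomic int gc_ready = 0;   /* stands in for ptls->gc_state / jl_gc_running */
static uv_mutex_t lock;            /* stands in for safepoint_lock */
static uv_cond_t cond;             /* stands in for safepoint_cond_begin / _end */

/* Waiter: the fast path is a lock-free check; only if the flag is still unset
 * does it take the lock, re-check, and sleep until a broadcast arrives. */
static void waiter(void *arg)
{
    (void)arg;
    while (!atomic_load_explicit(&gc_ready, memory_order_acquire)) {
        uv_mutex_lock(&lock);
        if (!atomic_load_explicit(&gc_ready, memory_order_relaxed))
            uv_cond_wait(&cond, &lock);   /* atomically releases lock while asleep */
        uv_mutex_unlock(&lock);
    }
    printf("waiter: observed the flag without spinning\n");
}

/* Announcer: publish the flag, then take and drop the lock before
 * broadcasting. Because the waiter only sleeps while holding the lock after
 * seeing the flag unset, this ordering cannot lose the wakeup. */
static void announcer(void *arg)
{
    (void)arg;
    uv_sleep(100);                 /* give the waiter time to block */
    atomic_store_explicit(&gc_ready, 1, memory_order_release);
    uv_mutex_lock(&lock);
    uv_mutex_unlock(&lock);
    uv_cond_broadcast(&cond);
}

int main(void)
{
    uv_mutex_init(&lock);
    uv_cond_init(&cond);
    uv_thread_t t1, t2;
    uv_thread_create(&t1, waiter, NULL);
    uv_thread_create(&t2, announcer, NULL);
    uv_thread_join(&t1);
    uv_thread_join(&t2);
    uv_cond_destroy(&cond);
    uv_mutex_destroy(&lock);
    return 0;
}

The patch applies this shape twice: jl_gc_wait_for_the_world sleeps on safepoint_cond_begin until each thread reports a non-zero gc_state, and jl_safepoint_wait_gc sleeps on safepoint_cond_end until jl_gc_running is cleared, with the corresponding broadcasts added on the transitions that flip those flags.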