From bf0cd8cfd7ff2308cbb0191bf13be01e1e0a7a0c Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Tue, 12 Dec 2023 18:27:19 +0000 Subject: [PATCH] Reland "gc: avoid cpu stalls when starting" This reverts commit 4801b6ce9a4d737ad1722ca620a7bc8087590f2e and adds the safepoints needed to catch the unsafe->safe transition also. --- src/gc-debug.c | 1 - src/gc.c | 27 +------------------- src/julia_threads.h | 6 ++--- src/safepoint.c | 61 ++++++++++++++++++++++++++++++++++++++++++--- src/signals-mach.c | 4 +++ 5 files changed, 65 insertions(+), 34 deletions(-) diff --git a/src/gc-debug.c b/src/gc-debug.c index 87861a5d89b9c7..84db461ce2b0a2 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -83,7 +83,6 @@ void add_lostval_parent(jl_value_t *parent) innocent looking functions which allocate (and thus trigger marking) only on special cases. If you can't find it, you can try the following : - - Ensure that should_timeout() is deterministic instead of clock based. - Once you have a completely deterministic program which crashes on gc_verify, the addresses should stay constant between different runs (with same binary, same environment ...). Do not forget to turn off ASLR (linux: echo 0 > /proc/sys/kernel/randomize_va_space). diff --git a/src/gc.c b/src/gc.c index 17975a075dea44..31e2da6ff997a2 100644 --- a/src/gc.c +++ b/src/gc.c @@ -222,32 +222,7 @@ NOINLINE uintptr_t gc_get_stack_ptr(void) return (uintptr_t)jl_get_frame_addr(); } -#define should_timeout() 0 - -void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) -{ - JL_TIMING(GC, GC_Stop); -#ifdef USE_TRACY - TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx; - TracyCZoneColor(ctx, 0x696969); -#endif - assert(gc_n_threads); - if (gc_n_threads > 1) - jl_wake_libuv(); - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 != NULL) { - // This acquire load pairs with the release stores - // in the signal handler of safepoint so we are sure that - // all the stores on those threads are visible. - // We're currently also using atomic store release in mutator threads - // (in jl_gc_state_set), but we may want to use signals to flush the - // memory operations on those threads lazily instead. - while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) - jl_cpu_pause(); // yield? - } - } -} +void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads); // malloc wrappers, aligned allocation diff --git a/src/julia_threads.h b/src/julia_threads.h index af7df23447c012..292c11f61d60d0 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -347,9 +347,9 @@ STATIC_INLINE int8_t jl_gc_state_set(jl_ptls_t ptls, int8_t state, int8_t old_state) { jl_atomic_store_release(&ptls->gc_state, state); - // A safe point is required if we transition from GC-safe region to - // non GC-safe region. - if (old_state && !state) + if (state == JL_GC_STATE_SAFE && old_state == 0) + jl_gc_safepoint_(ptls); + if (state == 0 && old_state == JL_GC_STATE_SAFE) jl_gc_safepoint_(ptls); return old_state; } diff --git a/src/safepoint.c b/src/safepoint.c index 51dae64e84e721..80490ada4e9e4a 100644 --- a/src/safepoint.c +++ b/src/safepoint.c @@ -44,7 +44,8 @@ uint16_t jl_safepoint_enable_cnt[4] = {0, 0, 0, 0}; // load/store so that threads waiting for the GC doesn't have to also // fight on the safepoint lock... uv_mutex_t safepoint_lock; -uv_cond_t safepoint_cond; +uv_cond_t safepoint_cond_begin; +uv_cond_t safepoint_cond_end; static void jl_safepoint_enable(int idx) JL_NOTSAFEPOINT { @@ -91,7 +92,8 @@ static void jl_safepoint_disable(int idx) JL_NOTSAFEPOINT void jl_safepoint_init(void) { uv_mutex_init(&safepoint_lock); - uv_cond_init(&safepoint_cond); + uv_cond_init(&safepoint_cond_begin); + uv_cond_init(&safepoint_cond_end); // jl_page_size isn't available yet. size_t pgsz = jl_getpagesize(); #ifdef _OS_WINDOWS_ @@ -124,6 +126,56 @@ void jl_safepoint_init(void) jl_safepoint_pages = addr; } +void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) +{ + JL_TIMING(GC, GC_Stop); +#ifdef USE_TRACY + TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx; + TracyCZoneColor(ctx, 0x696969); +#endif + assert(gc_n_threads); + if (gc_n_threads > 1) + jl_wake_libuv(); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 != NULL) { + // This acquire load pairs with the release stores + // in the signal handler of safepoint so we are sure that + // all the stores on those threads are visible. + // We're currently also using atomic store release in mutator threads + // (in jl_gc_state_set), but we may want to use signals to flush the + // memory operations on those threads lazily instead. + while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) + jl_cpu_pause(); // yield? + } + } +} + +void jl_gc_wait_for_the_world(void) +{ + assert(jl_n_threads); + if (jl_n_threads > 1) + jl_wake_libuv(); + for (int i = 0; i < jl_n_threads; i++) { + jl_ptls_t ptls2 = jl_all_tls_states[i]; + // This acquire load pairs with the release stores + // in the signal handler of safepoint so we are sure that + // all the stores on those threads are visible. + // We're currently also using atomic store release in mutator threads + // (in jl_gc_state_set), but we may want to use signals to flush the + // memory operations on those threads lazily instead. + while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) { + // Use system mutexes rather than spin locking to minimize wasted CPU time + // while we wait for other threads reach a safepoint. + // This is particularly important when run under rr. + uv_mutex_lock(&safepoint_lock); + if (!jl_atomic_load_relaxed(&ptls2->gc_state)) + uv_cond_wait(&safepoint_cond_begin, &safepoint_lock); + uv_mutex_unlock(&safepoint_lock); + } + } +} + int jl_safepoint_start_gc(void) { // The thread should have set this already @@ -169,7 +221,7 @@ void jl_safepoint_end_gc(void) jl_mach_gc_end(); # endif uv_mutex_unlock(&safepoint_lock); - uv_cond_broadcast(&safepoint_cond); + uv_cond_broadcast(&safepoint_cond_end); } // this is the core of jl_set_gc_and_wait @@ -179,6 +231,7 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct); // The thread should have set this is already assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != 0); + uv_cond_broadcast(&safepoint_cond_begin); // Use normal volatile load in the loop for speed until GC finishes. // Then use an acquire load to make sure the GC result is visible on this thread. while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) { @@ -187,7 +240,7 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT // This is particularly important when run under rr. uv_mutex_lock(&safepoint_lock); if (jl_atomic_load_relaxed(&jl_gc_running)) - uv_cond_wait(&safepoint_cond, &safepoint_lock); + uv_cond_wait(&safepoint_cond_end, &safepoint_lock); uv_mutex_unlock(&safepoint_lock); } } diff --git a/src/signals-mach.c b/src/signals-mach.c index b7ab8ecaf8ef67..cce3af22b53f0e 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -48,6 +48,8 @@ extern uv_mutex_t safepoint_lock; // see jl_safepoint_wait_thread_resume void jl_safepoint_resume_thread_mach(jl_ptls_t ptls2, int16_t tid2) +extern uv_cond_t safepoint_cond_begin; +void jl_mach_gc_end(void) { // must be called with uv_mutex_lock(&safepoint_lock) and uv_mutex_lock(&ptls2->sleep_lock) held (in that order) for (size_t i = 0; i < suspended_threads.len; i++) { @@ -123,6 +125,8 @@ static void jl_mach_gc_wait(jl_ptls_t ptls2, mach_port_t thread, int16_t tid) if (relaxed_suspend_count) uv_mutex_unlock(&ptls2->sleep_lock); uv_mutex_unlock(&safepoint_lock); + uv_cond_broadcast(&safepoint_cond_begin); + return 1; } static mach_port_t segv_port = 0;