From e0568e2682549d91a77d9e9c41e045d892b7057e Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Tue, 12 Dec 2023 19:23:29 +0000 Subject: [PATCH] fixup! Reland "gc: avoid cpu stalls when starting" --- src/julia_internal.h | 12 +----- src/llvm-codegen-shared.h | 4 +- src/rtutils.c | 4 +- src/safepoint.c | 77 +++++++++++++++++++++------------------ src/signals-mach.c | 5 +-- 5 files changed, 49 insertions(+), 53 deletions(-) diff --git a/src/julia_internal.h b/src/julia_internal.h index 9105a291ac82b3..7f9cf06798ce09 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -966,17 +966,7 @@ JL_DLLEXPORT void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t extern pthread_mutex_t in_signal_lock; #endif -static inline void jl_set_gc_and_wait(void) // n.b. not used on _OS_DARWIN_ -{ - jl_task_t *ct = jl_current_task; - // reading own gc state doesn't need atomic ops since no one else - // should store to it. - int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state); - jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING); - jl_safepoint_wait_gc(); - jl_atomic_store_release(&ct->ptls->gc_state, state); - jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state -} +void jl_set_gc_and_wait(void); // n.b. not used on _OS_DARWIN_ // Query if a Julia object is if a permalloc region (due to part of a sys- pkg-image) STATIC_INLINE size_t n_linkage_blobs(void) JL_NOTSAFEPOINT diff --git a/src/llvm-codegen-shared.h b/src/llvm-codegen-shared.h index b78a3bea9c0ef7..a4f77bc1b3b381 100644 --- a/src/llvm-codegen-shared.h +++ b/src/llvm-codegen-shared.h @@ -304,8 +304,8 @@ static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::T BasicBlock *passBB = BasicBlock::Create(builder.getContext(), "safepoint", builder.GetInsertBlock()->getParent()); BasicBlock *exitBB = BasicBlock::Create(builder.getContext(), "after_safepoint", builder.GetInsertBlock()->getParent()); Constant *zero8 = ConstantInt::get(T_int8, 0); - builder.CreateCondBr(builder.CreateAnd(builder.CreateICmpNE(old_state, zero8), // if (old_state && !state) - builder.CreateICmpEQ(state, zero8)), + builder.CreateCondBr(builder.CreateOr(builder.CreateICmpEQ(old_state, zero8), // if (!old_state || !state) + builder.CreateICmpEQ(state, zero8)), passBB, exitBB); builder.SetInsertPoint(passBB); MDNode *tbaa = get_tbaa_const(builder.getContext()); diff --git a/src/rtutils.c b/src/rtutils.c index 5fbba42c15b023..74f66ae0b1769e 100644 --- a/src/rtutils.c +++ b/src/rtutils.c @@ -281,7 +281,6 @@ JL_DLLEXPORT void jl_eh_restore_state(jl_handler_t *eh) // This function should **NOT** have any safepoint before the ones at the // end. sig_atomic_t old_defer_signal = ct->ptls->defer_signal; - int8_t old_gc_state = jl_atomic_load_relaxed(&ct->ptls->gc_state); ct->eh = eh->prev; ct->gcstack = eh->gcstack; small_arraylist_t *locks = &ct->ptls->locks; @@ -293,9 +292,10 @@ JL_DLLEXPORT void jl_eh_restore_state(jl_handler_t *eh) } ct->world_age = eh->world_age; ct->ptls->defer_signal = eh->defer_signal; + int8_t old_gc_state = jl_atomic_load_relaxed(&ct->ptls->gc_state); if (old_gc_state != eh->gc_state) jl_atomic_store_release(&ct->ptls->gc_state, eh->gc_state); - if (!eh->gc_state) + if (!old_gc_state || !eh->gc_state) // it was or is unsafe now jl_gc_safepoint_(ct->ptls); if (old_defer_signal && !eh->defer_signal) jl_sigint_safepoint(ct->ptls); diff --git a/src/safepoint.c b/src/safepoint.c index 80490ada4e9e4a..a05d37e4928132 100644 --- a/src/safepoint.c +++ b/src/safepoint.c @@ -145,43 +145,32 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) // We're currently also using atomic store release in mutator threads // (in jl_gc_state_set), but we may want to use signals to flush the // memory operations on those threads lazily instead. - while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) - jl_cpu_pause(); // yield? - } - } -} - -void jl_gc_wait_for_the_world(void) -{ - assert(jl_n_threads); - if (jl_n_threads > 1) - jl_wake_libuv(); - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; - // This acquire load pairs with the release stores - // in the signal handler of safepoint so we are sure that - // all the stores on those threads are visible. - // We're currently also using atomic store release in mutator threads - // (in jl_gc_state_set), but we may want to use signals to flush the - // memory operations on those threads lazily instead. - while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) { - // Use system mutexes rather than spin locking to minimize wasted CPU time - // while we wait for other threads reach a safepoint. - // This is particularly important when run under rr. - uv_mutex_lock(&safepoint_lock); - if (!jl_atomic_load_relaxed(&ptls2->gc_state)) - uv_cond_wait(&safepoint_cond_begin, &safepoint_lock); - uv_mutex_unlock(&safepoint_lock); + while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) { + // Use system mutexes rather than spin locking to minimize wasted CPU time + // while we wait for other threads reach a safepoint. + // This is particularly important when run under rr. + uv_mutex_lock(&safepoint_lock); + if (!jl_atomic_load_relaxed(&ptls2->gc_state)) + uv_cond_wait(&safepoint_cond_begin, &safepoint_lock); + uv_mutex_unlock(&safepoint_lock); + } } } } int jl_safepoint_start_gc(void) { - // The thread should have set this already + // The thread should have just set this before entry assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) == JL_GC_STATE_WAITING); - jl_safepoint_wait_thread_resume(); // make sure we are permitted to run GC now (we might be required to stop instead) uv_mutex_lock(&safepoint_lock); + uv_cond_broadcast(&safepoint_cond_begin); + // make sure we are permitted to run GC now (we might be required to stop instead) + jl_task_t *ct = jl_current_task; + while (jl_atomic_load_relaxed(&ct->ptls->suspend_count)) { + uv_mutex_unlock(&safepoint_lock); + jl_safepoint_wait_thread_resume(); + uv_mutex_lock(&safepoint_lock); + } // In case multiple threads enter the GC at the same time, only allow // one of them to actually run the collection. We can't just let the // master thread do the GC since it might be running unmanaged code @@ -224,6 +213,21 @@ void jl_safepoint_end_gc(void) uv_cond_broadcast(&safepoint_cond_end); } +void jl_set_gc_and_wait(void) // n.b. not used on _OS_DARWIN_ +{ + jl_task_t *ct = jl_current_task; + // reading own gc state doesn't need atomic ops since no one else + // should store to it. + int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state); + jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING); + uv_mutex_lock(&safepoint_lock); + uv_cond_broadcast(&safepoint_cond_begin); + uv_mutex_unlock(&safepoint_lock); + jl_safepoint_wait_gc(); + jl_atomic_store_release(&ct->ptls->gc_state, state); + jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state +} + // this is the core of jl_set_gc_and_wait void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT { @@ -231,7 +235,6 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct); // The thread should have set this is already assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != 0); - uv_cond_broadcast(&safepoint_cond_begin); // Use normal volatile load in the loop for speed until GC finishes. // Then use an acquire load to make sure the GC result is visible on this thread. while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) { @@ -257,6 +260,14 @@ void jl_safepoint_wait_thread_resume(void) int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state); jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING); uv_mutex_lock(&ct->ptls->sleep_lock); + if (jl_atomic_load_relaxed(&ct->ptls->suspend_count)) { + // defer this broadcast until we determine whether uv_cond_wait is really going to be needed + uv_mutex_unlock(&ct->ptls->sleep_lock); + uv_mutex_lock(&safepoint_lock); + uv_cond_broadcast(&safepoint_cond_begin); + uv_mutex_unlock(&safepoint_lock); + uv_mutex_lock(&ct->ptls->sleep_lock); + } while (jl_atomic_load_relaxed(&ct->ptls->suspend_count)) uv_cond_wait(&ct->ptls->wake_signal, &ct->ptls->sleep_lock); // must while still holding the mutex_unlock, so we know other threads in @@ -302,7 +313,7 @@ int jl_safepoint_suspend_thread(int tid, int waitstate) break; if (waitstate == 3 && state2 == JL_GC_STATE_WAITING) break; - jl_cpu_pause(); // yield? + jl_cpu_pause(); // yield (wait for safepoint_cond_begin, for example)? } } return suspend_count; @@ -315,9 +326,7 @@ int jl_safepoint_resume_thread(int tid) JL_NOTSAFEPOINT if (0 > tid || tid >= jl_atomic_load_acquire(&jl_n_threads)) return 0; jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; -# ifdef _OS_DARWIN_ uv_mutex_lock(&safepoint_lock); -# endif uv_mutex_lock(&ptls2->sleep_lock); int16_t suspend_count = jl_atomic_load_relaxed(&ptls2->suspend_count); if (suspend_count == 1) { // last to unsuspend @@ -336,9 +345,7 @@ int jl_safepoint_resume_thread(int tid) JL_NOTSAFEPOINT jl_safepoint_disable(3); } uv_mutex_unlock(&ptls2->sleep_lock); -# ifdef _OS_DARWIN_ uv_mutex_unlock(&safepoint_lock); -# endif return suspend_count; } diff --git a/src/signals-mach.c b/src/signals-mach.c index cce3af22b53f0e..48e124b54cb61c 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -45,11 +45,10 @@ static void attach_exception_port(thread_port_t thread, int segv_only); // low 16 bits are the thread id, the next 8 bits are the original gc_state static arraylist_t suspended_threads; extern uv_mutex_t safepoint_lock; +extern uv_cond_t safepoint_cond_begin; // see jl_safepoint_wait_thread_resume void jl_safepoint_resume_thread_mach(jl_ptls_t ptls2, int16_t tid2) -extern uv_cond_t safepoint_cond_begin; -void jl_mach_gc_end(void) { // must be called with uv_mutex_lock(&safepoint_lock) and uv_mutex_lock(&ptls2->sleep_lock) held (in that order) for (size_t i = 0; i < suspended_threads.len; i++) { @@ -124,8 +123,8 @@ static void jl_mach_gc_wait(jl_ptls_t ptls2, mach_port_t thread, int16_t tid) } if (relaxed_suspend_count) uv_mutex_unlock(&ptls2->sleep_lock); - uv_mutex_unlock(&safepoint_lock); uv_cond_broadcast(&safepoint_cond_begin); + uv_mutex_unlock(&safepoint_lock); return 1; }