Reland "gc: avoid cpu stalls when starting" #45794

Merged 3 commits on Dec 14, 2023
1 change: 0 additions & 1 deletion src/gc-debug.c
@@ -83,7 +83,6 @@ void add_lostval_parent(jl_value_t *parent)
    innocent looking functions which allocate (and thus trigger marking) only on special cases.

    If you can't find it, you can try the following :
-   - Ensure that should_timeout() is deterministic instead of clock based.
    - Once you have a completely deterministic program which crashes on gc_verify, the addresses
      should stay constant between different runs (with same binary, same environment ...).
      Do not forget to turn off ASLR (linux: echo 0 > /proc/sys/kernel/randomize_va_space).
27 changes: 1 addition & 26 deletions src/gc.c
@@ -222,32 +222,7 @@ NOINLINE uintptr_t gc_get_stack_ptr(void)
     return (uintptr_t)jl_get_frame_addr();
 }

-#define should_timeout() 0
-
-void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
-{
-    JL_TIMING(GC, GC_Stop);
-#ifdef USE_TRACY
-    TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx;
-    TracyCZoneColor(ctx, 0x696969);
-#endif
-    assert(gc_n_threads);
-    if (gc_n_threads > 1)
-        jl_wake_libuv();
-    for (int i = 0; i < gc_n_threads; i++) {
-        jl_ptls_t ptls2 = gc_all_tls_states[i];
-        if (ptls2 != NULL) {
-            // This acquire load pairs with the release stores
-            // in the signal handler of safepoint so we are sure that
-            // all the stores on those threads are visible.
-            // We're currently also using atomic store release in mutator threads
-            // (in jl_gc_state_set), but we may want to use signals to flush the
-            // memory operations on those threads lazily instead.
-            while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state))
-                jl_cpu_pause(); // yield?
-        }
-    }
-}
+void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads);

 // malloc wrappers, aligned allocation

12 changes: 1 addition & 11 deletions src/julia_internal.h
@@ -966,17 +966,7 @@ JL_DLLEXPORT void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t
 extern pthread_mutex_t in_signal_lock;
 #endif

-static inline void jl_set_gc_and_wait(void) // n.b. not used on _OS_DARWIN_
-{
-    jl_task_t *ct = jl_current_task;
-    // reading own gc state doesn't need atomic ops since no one else
-    // should store to it.
-    int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
-    jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING);
-    jl_safepoint_wait_gc();
-    jl_atomic_store_release(&ct->ptls->gc_state, state);
-    jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state
-}
+void jl_set_gc_and_wait(void); // n.b. not used on _OS_DARWIN_

 // Query if a Julia object is if a permalloc region (due to part of a sys- pkg-image)
 STATIC_INLINE size_t n_linkage_blobs(void) JL_NOTSAFEPOINT
6 changes: 3 additions & 3 deletions src/julia_threads.h
@@ -347,9 +347,9 @@ STATIC_INLINE int8_t jl_gc_state_set(jl_ptls_t ptls, int8_t state,
                                      int8_t old_state)
 {
     jl_atomic_store_release(&ptls->gc_state, state);
-    // A safe point is required if we transition from GC-safe region to
-    // non GC-safe region.
-    if (old_state && !state)
+    if (state == JL_GC_STATE_SAFE && old_state == 0)
+        jl_gc_safepoint_(ptls);
+    if (state == 0 && old_state == JL_GC_STATE_SAFE)
         jl_gc_safepoint_(ptls);
     return old_state;
 }
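Review note on the hunk above: the old guard `if (old_state && !state)` ran the safepoint only when a thread returned from a GC-safe region to managed (unsafe, 0) code. The new pair of guards also runs it when a thread enters JL_GC_STATE_SAFE from 0; as I read the rest of this diff, that extra poll is what ends up waking a collector sleeping on safepoint_cond_begin in jl_gc_wait_for_the_world, since the armed safepoint page routes the thread through the signal handler, which broadcasts the condition. A minimal standalone sketch of the new predicate (the constant names and values mirror julia_threads.h but are assumptions here, not part of this diff):

#include <assert.h>

#define GC_STATE_UNSAFE  0 /* running managed code; must stop for GC (assumed value) */
#define GC_STATE_WAITING 1 /* stopped, waiting for the GC to finish (assumed value)  */
#define GC_STATE_SAFE    2 /* may keep running concurrently with GC (assumed value)  */

/* New rule: poll the safepoint page on both crossings between unsafe (0) and SAFE. */
static int needs_safepoint(int state, int old_state)
{
    return (state == GC_STATE_SAFE && old_state == GC_STATE_UNSAFE)
        || (state == GC_STATE_UNSAFE && old_state == GC_STATE_SAFE);
}

int main(void)
{
    assert(needs_safepoint(GC_STATE_SAFE, GC_STATE_UNSAFE));     /* entering safe: new behavior */
    assert(needs_safepoint(GC_STATE_UNSAFE, GC_STATE_SAFE));     /* leaving safe: as before     */
    assert(!needs_safepoint(GC_STATE_WAITING, GC_STATE_UNSAFE)); /* no SAFE endpoint: no poll   */
    return 0;
}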
4 changes: 2 additions & 2 deletions src/llvm-codegen-shared.h
@@ -304,8 +304,8 @@ static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::T
     BasicBlock *passBB = BasicBlock::Create(builder.getContext(), "safepoint", builder.GetInsertBlock()->getParent());
     BasicBlock *exitBB = BasicBlock::Create(builder.getContext(), "after_safepoint", builder.GetInsertBlock()->getParent());
     Constant *zero8 = ConstantInt::get(T_int8, 0);
-    builder.CreateCondBr(builder.CreateAnd(builder.CreateICmpNE(old_state, zero8), // if (old_state && !state)
-                         builder.CreateICmpEQ(state, zero8)),
+    builder.CreateCondBr(builder.CreateOr(builder.CreateICmpEQ(old_state, zero8), // if (!old_state || !state)
+                         builder.CreateICmpEQ(state, zero8)),
                          passBB, exitBB);
     builder.SetInsertPoint(passBB);
     MDNode *tbaa = get_tbaa_const(builder.getContext());
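In C terms, the emitted branch condition changes from `old_state != 0 && state == 0` to `old_state == 0 || state == 0`, matching the inline comment: compiled code now reaches the safepoint block whenever either endpoint of the transition is the unsafe state 0, a conservative superset of the two exact transitions tested in `jl_gc_state_set`. A compilable sketch of the two guards (illustrative only, not Julia's API):

#include <assert.h>

/* Guard emitted before this change: poll only when leaving a safe region for 0. */
static int old_guard(int old_state, int state) { return old_state != 0 && state == 0; }

/* Guard emitted after this change: poll whenever either side of the transition
 * is the unsafe state 0; the slow path decides what actually needs to happen. */
static int new_guard(int old_state, int state) { return old_state == 0 || state == 0; }

int main(void)
{
    assert(old_guard(2, 0) && new_guard(2, 0));  /* leaving safe: caught by both   */
    assert(!old_guard(0, 2) && new_guard(0, 2)); /* entering safe: only caught now */
    return 0;
}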
4 changes: 2 additions & 2 deletions src/rtutils.c
@@ -281,7 +281,6 @@ JL_DLLEXPORT void jl_eh_restore_state(jl_handler_t *eh)
     // This function should **NOT** have any safepoint before the ones at the
     // end.
     sig_atomic_t old_defer_signal = ct->ptls->defer_signal;
-    int8_t old_gc_state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
     ct->eh = eh->prev;
     ct->gcstack = eh->gcstack;
     small_arraylist_t *locks = &ct->ptls->locks;
@@ -293,9 +292,10 @@
     }
     ct->world_age = eh->world_age;
     ct->ptls->defer_signal = eh->defer_signal;
+    int8_t old_gc_state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
     if (old_gc_state != eh->gc_state)
         jl_atomic_store_release(&ct->ptls->gc_state, eh->gc_state);
-    if (!eh->gc_state)
+    if (!old_gc_state || !eh->gc_state) // it was or is unsafe now
         jl_gc_safepoint_(ct->ptls);
     if (old_defer_signal && !eh->defer_signal)
         jl_sigint_safepoint(ct->ptls);
82 changes: 71 additions & 11 deletions src/safepoint.c
@@ -44,7 +44,8 @@ uint16_t jl_safepoint_enable_cnt[4] = {0, 0, 0, 0};
 // load/store so that threads waiting for the GC doesn't have to also
 // fight on the safepoint lock...
 uv_mutex_t safepoint_lock;
-uv_cond_t safepoint_cond;
+uv_cond_t safepoint_cond_begin;
+uv_cond_t safepoint_cond_end;

 static void jl_safepoint_enable(int idx) JL_NOTSAFEPOINT
 {
@@ -91,7 +92,8 @@ static void jl_safepoint_disable(int idx) JL_NOTSAFEPOINT
 void jl_safepoint_init(void)
 {
     uv_mutex_init(&safepoint_lock);
-    uv_cond_init(&safepoint_cond);
+    uv_cond_init(&safepoint_cond_begin);
+    uv_cond_init(&safepoint_cond_end);
     // jl_page_size isn't available yet.
     size_t pgsz = jl_getpagesize();
 #ifdef _OS_WINDOWS_
@@ -124,12 +126,51 @@
     jl_safepoint_pages = addr;
 }

+void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
+{
+    JL_TIMING(GC, GC_Stop);
+#ifdef USE_TRACY
+    TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx;
+    TracyCZoneColor(ctx, 0x696969);
+#endif
+    assert(gc_n_threads);
+    if (gc_n_threads > 1)
+        jl_wake_libuv();
+    for (int i = 0; i < gc_n_threads; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        if (ptls2 != NULL) {
+            // This acquire load pairs with the release stores
+            // in the signal handler of safepoint so we are sure that
+            // all the stores on those threads are visible.
+            // We're currently also using atomic store release in mutator threads
+            // (in jl_gc_state_set), but we may want to use signals to flush the
+            // memory operations on those threads lazily instead.
+            while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) {
+                // Use system mutexes rather than spin locking to minimize wasted CPU time
+                // while we wait for other threads reach a safepoint.
+                // This is particularly important when run under rr.
+                uv_mutex_lock(&safepoint_lock);
+                if (!jl_atomic_load_relaxed(&ptls2->gc_state))
+                    uv_cond_wait(&safepoint_cond_begin, &safepoint_lock);
+                uv_mutex_unlock(&safepoint_lock);
+            }
+        }
+    }
+}
+
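Two idioms in the relocated function deserve a note. The `!relaxed || !acquire` double load keeps the cheap relaxed read on the fast path and pays for acquire ordering only once the state already looks nonzero, and re-checking `gc_state` after taking `safepoint_lock` ensures a broadcast cannot slip in between the test and the `uv_cond_wait`. A self-contained pthreads rendering of the same shape (the pthreads substitution and all names here are mine, not the PR's):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond_begin = PTHREAD_COND_INITIALIZER;
static atomic_int gc_state; /* 0 = the mutator has not reached a safepoint yet */

/* Collector side: sleep rather than spin until the mutator publishes a nonzero
 * state. The relaxed load is the cheap fast-path test; the acquire load runs
 * only once the flag already looks set and pairs with the release store below. */
static void wait_until_safepoint_reached(void)
{
    while (atomic_load_explicit(&gc_state, memory_order_relaxed) == 0 ||
           atomic_load_explicit(&gc_state, memory_order_acquire) == 0) {
        pthread_mutex_lock(&lock);
        /* re-check under the lock: a wakeup cannot be lost between this test
         * and the wait, because the mutator broadcasts while holding the lock */
        if (atomic_load_explicit(&gc_state, memory_order_relaxed) == 0)
            pthread_cond_wait(&cond_begin, &lock); /* atomically unlocks and sleeps */
        pthread_mutex_unlock(&lock);
    }
}

/* Mutator side: publish the state with release, then wake any sleeping waiter. */
static void *announce_safepoint(void *arg)
{
    (void)arg;
    atomic_store_explicit(&gc_state, 2, memory_order_release);
    pthread_mutex_lock(&lock);
    pthread_cond_broadcast(&cond_begin);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, announce_safepoint, NULL);
    wait_until_safepoint_reached(); /* woken via the condvar, no busy spin */
    pthread_join(t, NULL);
    puts("safepoint reached");
    return 0;
}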
 int jl_safepoint_start_gc(void)
 {
-    // The thread should have set this already
+    // The thread should have just set this before entry
     assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) == JL_GC_STATE_WAITING);
-    jl_safepoint_wait_thread_resume(); // make sure we are permitted to run GC now (we might be required to stop instead)
     uv_mutex_lock(&safepoint_lock);
+    uv_cond_broadcast(&safepoint_cond_begin);
+    // make sure we are permitted to run GC now (we might be required to stop instead)
+    jl_task_t *ct = jl_current_task;
+    while (jl_atomic_load_relaxed(&ct->ptls->suspend_count)) {
+        uv_mutex_unlock(&safepoint_lock);
+        jl_safepoint_wait_thread_resume();
+        uv_mutex_lock(&safepoint_lock);
+    }
     // In case multiple threads enter the GC at the same time, only allow
     // one of them to actually run the collection. We can't just let the
     // master thread do the GC since it might be running unmanaged code
@@ -169,7 +210,22 @@ void jl_safepoint_end_gc(void)
     jl_mach_gc_end();
 # endif
     uv_mutex_unlock(&safepoint_lock);
-    uv_cond_broadcast(&safepoint_cond);
+    uv_cond_broadcast(&safepoint_cond_end);
 }

+void jl_set_gc_and_wait(void) // n.b. not used on _OS_DARWIN_
+{
+    jl_task_t *ct = jl_current_task;
+    // reading own gc state doesn't need atomic ops since no one else
+    // should store to it.
+    int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
+    jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING);
+    uv_mutex_lock(&safepoint_lock);
+    uv_cond_broadcast(&safepoint_cond_begin);
+    uv_mutex_unlock(&safepoint_lock);
+    jl_safepoint_wait_gc();
+    jl_atomic_store_release(&ct->ptls->gc_state, state);
+    jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state
+}
+
 // this is the core of jl_set_gc_and_wait
@@ -187,7 +243,7 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT
         // This is particularly important when run under rr.
         uv_mutex_lock(&safepoint_lock);
         if (jl_atomic_load_relaxed(&jl_gc_running))
-            uv_cond_wait(&safepoint_cond, &safepoint_lock);
+            uv_cond_wait(&safepoint_cond_end, &safepoint_lock);
         uv_mutex_unlock(&safepoint_lock);
     }
 }
@@ -204,6 +260,14 @@ void jl_safepoint_wait_thread_resume(void)
     int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
     jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING);
     uv_mutex_lock(&ct->ptls->sleep_lock);
+    if (jl_atomic_load_relaxed(&ct->ptls->suspend_count)) {
+        // defer this broadcast until we determine whether uv_cond_wait is really going to be needed
+        uv_mutex_unlock(&ct->ptls->sleep_lock);
+        uv_mutex_lock(&safepoint_lock);
+        uv_cond_broadcast(&safepoint_cond_begin);
+        uv_mutex_unlock(&safepoint_lock);
+        uv_mutex_lock(&ct->ptls->sleep_lock);
+    }
     while (jl_atomic_load_relaxed(&ct->ptls->suspend_count))
         uv_cond_wait(&ct->ptls->wake_signal, &ct->ptls->sleep_lock);
     // must while still holding the mutex_unlock, so we know other threads in
@@ -249,7 +313,7 @@ int jl_safepoint_suspend_thread(int tid, int waitstate)
                 break;
             if (waitstate == 3 && state2 == JL_GC_STATE_WAITING)
                 break;
-            jl_cpu_pause(); // yield?
+            jl_cpu_pause(); // yield (wait for safepoint_cond_begin, for example)?
         }
     }
     return suspend_count;
@@ -262,9 +326,7 @@
     if (0 > tid || tid >= jl_atomic_load_acquire(&jl_n_threads))
         return 0;
     jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
-# ifdef _OS_DARWIN_
     uv_mutex_lock(&safepoint_lock);
-# endif
     uv_mutex_lock(&ptls2->sleep_lock);
     int16_t suspend_count = jl_atomic_load_relaxed(&ptls2->suspend_count);
     if (suspend_count == 1) { // last to unsuspend
@@ -283,9 +345,7 @@
         jl_safepoint_disable(3);
     }
     uv_mutex_unlock(&ptls2->sleep_lock);
-# ifdef _OS_DARWIN_
     uv_mutex_unlock(&safepoint_lock);
-# endif
     return suspend_count;
 }

28 changes: 23 additions & 5 deletions src/scheduler.c
@@ -32,7 +32,7 @@ static const int16_t sleeping_like_the_dead JL_UNUSED = 2;
 // a running count of how many threads are currently not_sleeping
 // plus a running count of the number of in-flight wake-ups
 // n.b. this may temporarily exceed jl_n_threads
-static _Atomic(int) nrunning = 1;
+static _Atomic(int) nrunning = 0;

 // invariant: No thread is ever asleep unless sleep_check_state is sleeping (or we have a wakeup signal pending).
 // invariant: Any particular thread is not asleep unless that thread's sleep_check_state is sleeping.
@@ -201,6 +201,20 @@ void jl_threadfun(void *arg)
 }

+
+void jl_init_thread_scheduler(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    uv_mutex_init(&ptls->sleep_lock);
+    uv_cond_init(&ptls->wake_signal);
+    // record that there is now another thread that may be used to schedule work
+    // we will decrement this again in scheduler_delete_thread, only slightly
+    // in advance of pthread_join (which hopefully itself also had been
+    // adopted by now and is included in nrunning too)
+    (void)jl_atomic_fetch_add_relaxed(&nrunning, 1);
+    // n.b. this is the only point in the code where we ignore the invariants on the ordering of nrunning
+    // since we are being initialized from foreign code, we could not necessarily have expected or predicted that to happen
+}
+
 int jl_running_under_rr(int recheck)
 {
 #ifdef _OS_LINUX_
@@ -581,9 +595,10 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,

 void scheduler_delete_thread(jl_ptls_t ptls) JL_NOTSAFEPOINT
 {
-    if (jl_atomic_exchange_relaxed(&ptls->sleep_check_state, sleeping_like_the_dead) != sleeping) {
-        int wasrunning = jl_atomic_fetch_add_relaxed(&nrunning, -1);
-        if (wasrunning == 1) {
+    int notsleeping = jl_atomic_exchange_relaxed(&ptls->sleep_check_state, sleeping_like_the_dead) == not_sleeping;
+    jl_fence();
+    if (notsleeping) {
+        if (jl_atomic_load_relaxed(&nrunning) == 1) {
             jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0];
             // This was the last running thread, and there is no thread with !may_sleep
             // so make sure tid 0 is notified to check wait_empty
@@ -592,8 +607,11 @@
             uv_mutex_unlock(&ptls2->sleep_lock);
         }
     }
-    jl_fence();
+    else {
+        jl_atomic_fetch_add_relaxed(&nrunning, 1);
+    }
     jl_wakeup_thread(0); // force thread 0 to see that we do not have the IO lock (and am dead)
+    jl_atomic_fetch_add_relaxed(&nrunning, -1);
 }

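Review note: `nrunning` counts not-sleeping threads, so in the rewritten exit path a thread that leaves while marked sleeping first re-adds itself, which makes the single trailing decrement correct in both cases; deferring that decrement until after `jl_wakeup_thread(0)` keeps the exiting thread counted while thread 0 is being nudged, as I read it. A condensed, compilable model of just that bookkeeping (illustrative; `notify_thread0` is a hypothetical stand-in for `jl_wakeup_thread(0)`):

#include <assert.h>
#include <stdatomic.h>

static atomic_int nrunning; /* number of not-sleeping threads (plus in-flight wakeups) */

static void notify_thread0(void) { /* stand-in for jl_wakeup_thread(0) */ }

static void scheduler_exit(int was_not_sleeping)
{
    if (!was_not_sleeping)
        atomic_fetch_add(&nrunning, 1); /* we were not counted; balance the -1 below */
    notify_thread0();                   /* still counted here, so thread 0 cannot see 0 early */
    atomic_fetch_add(&nrunning, -1);    /* finally leave the running count */
}

int main(void)
{
    atomic_store(&nrunning, 1); /* this thread, counted as not sleeping */
    scheduler_exit(1);
    assert(atomic_load(&nrunning) == 0);

    atomic_store(&nrunning, 0); /* this thread was sleeping: not counted */
    scheduler_exit(0);
    assert(atomic_load(&nrunning) == 0);
    return 0;
}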
 #ifdef __cplusplus
2 changes: 2 additions & 0 deletions src/signals-mach.c
@@ -45,6 +45,7 @@ static void attach_exception_port(thread_port_t thread, int segv_only);
 // low 16 bits are the thread id, the next 8 bits are the original gc_state
 static arraylist_t suspended_threads;
 extern uv_mutex_t safepoint_lock;
+extern uv_cond_t safepoint_cond_begin;

 // see jl_safepoint_wait_thread_resume
 void jl_safepoint_resume_thread_mach(jl_ptls_t ptls2, int16_t tid2)
@@ -122,6 +123,7 @@ static void jl_mach_gc_wait(jl_ptls_t ptls2, mach_port_t thread, int16_t tid)
     }
     if (relaxed_suspend_count)
         uv_mutex_unlock(&ptls2->sleep_lock);
+    uv_cond_broadcast(&safepoint_cond_begin);
     uv_mutex_unlock(&safepoint_lock);
 }

5 changes: 2 additions & 3 deletions src/threading.c
@@ -316,6 +316,7 @@ int jl_all_tls_states_size;
 static uv_cond_t cond;
 // concurrent reads are permitted, using the same pattern as mtsmall_arraylist
 // it is implemented separately because the API of direct jl_all_tls_states use is already widely prevalent
+void jl_init_thread_scheduler(jl_ptls_t ptls) JL_NOTSAFEPOINT;

 // return calling thread's ID
 JL_DLLEXPORT int16_t jl_threadid(void)
@@ -379,9 +380,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid)
     ptls->bt_data = bt_data;
     small_arraylist_new(&ptls->locks, 0);
     jl_init_thread_heap(ptls);
-
-    uv_mutex_init(&ptls->sleep_lock);
-    uv_cond_init(&ptls->wake_signal);
+    jl_init_thread_scheduler(ptls);

     uv_mutex_lock(&tls_lock);
     if (tid == -1)