Skip to content

Commit

Permalink
Reland "gc: avoid cpu stalls when starting"
Browse files Browse the repository at this point in the history
This reverts commit 4801b6c and adds
the safepoints needed to catch the unsafe->safe transition also.
  • Loading branch information
vtjnash committed Dec 12, 2023
1 parent eba10dd commit bf0cd8c
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 34 deletions.
1 change: 0 additions & 1 deletion src/gc-debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ void add_lostval_parent(jl_value_t *parent)
innocent looking functions which allocate (and thus trigger marking) only on special cases.
If you can't find it, you can try the following :
- Ensure that should_timeout() is deterministic instead of clock based.
- Once you have a completely deterministic program which crashes on gc_verify, the addresses
should stay constant between different runs (with same binary, same environment ...).
Do not forget to turn off ASLR (linux: echo 0 > /proc/sys/kernel/randomize_va_space).
Expand Down
27 changes: 1 addition & 26 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -222,32 +222,7 @@ NOINLINE uintptr_t gc_get_stack_ptr(void)
return (uintptr_t)jl_get_frame_addr();
}

#define should_timeout() 0

void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
{
JL_TIMING(GC, GC_Stop);
#ifdef USE_TRACY
TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx;
TracyCZoneColor(ctx, 0x696969);
#endif
assert(gc_n_threads);
if (gc_n_threads > 1)
jl_wake_libuv();
for (int i = 0; i < gc_n_threads; i++) {
jl_ptls_t ptls2 = gc_all_tls_states[i];
if (ptls2 != NULL) {
// This acquire load pairs with the release stores
// in the signal handler of safepoint so we are sure that
// all the stores on those threads are visible.
// We're currently also using atomic store release in mutator threads
// (in jl_gc_state_set), but we may want to use signals to flush the
// memory operations on those threads lazily instead.
while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state))
jl_cpu_pause(); // yield?
}
}
}
void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads);

// malloc wrappers, aligned allocation

Expand Down
6 changes: 3 additions & 3 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -347,9 +347,9 @@ STATIC_INLINE int8_t jl_gc_state_set(jl_ptls_t ptls, int8_t state,
int8_t old_state)
{
jl_atomic_store_release(&ptls->gc_state, state);
// A safe point is required if we transition from GC-safe region to
// non GC-safe region.
if (old_state && !state)
if (state == JL_GC_STATE_SAFE && old_state == 0)
jl_gc_safepoint_(ptls);
if (state == 0 && old_state == JL_GC_STATE_SAFE)
jl_gc_safepoint_(ptls);
return old_state;
}
Expand Down
61 changes: 57 additions & 4 deletions src/safepoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ uint16_t jl_safepoint_enable_cnt[4] = {0, 0, 0, 0};
// load/store so that threads waiting for the GC doesn't have to also
// fight on the safepoint lock...
uv_mutex_t safepoint_lock;
uv_cond_t safepoint_cond;
uv_cond_t safepoint_cond_begin;
uv_cond_t safepoint_cond_end;

static void jl_safepoint_enable(int idx) JL_NOTSAFEPOINT
{
Expand Down Expand Up @@ -91,7 +92,8 @@ static void jl_safepoint_disable(int idx) JL_NOTSAFEPOINT
void jl_safepoint_init(void)
{
uv_mutex_init(&safepoint_lock);
uv_cond_init(&safepoint_cond);
uv_cond_init(&safepoint_cond_begin);
uv_cond_init(&safepoint_cond_end);
// jl_page_size isn't available yet.
size_t pgsz = jl_getpagesize();
#ifdef _OS_WINDOWS_
Expand Down Expand Up @@ -124,6 +126,56 @@ void jl_safepoint_init(void)
jl_safepoint_pages = addr;
}

void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
{
JL_TIMING(GC, GC_Stop);
#ifdef USE_TRACY
TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx;
TracyCZoneColor(ctx, 0x696969);
#endif
assert(gc_n_threads);
if (gc_n_threads > 1)
jl_wake_libuv();
for (int i = 0; i < gc_n_threads; i++) {
jl_ptls_t ptls2 = gc_all_tls_states[i];
if (ptls2 != NULL) {
// This acquire load pairs with the release stores
// in the signal handler of safepoint so we are sure that
// all the stores on those threads are visible.
// We're currently also using atomic store release in mutator threads
// (in jl_gc_state_set), but we may want to use signals to flush the
// memory operations on those threads lazily instead.
while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state))
jl_cpu_pause(); // yield?
}
}
}

void jl_gc_wait_for_the_world(void)
{
assert(jl_n_threads);
if (jl_n_threads > 1)
jl_wake_libuv();
for (int i = 0; i < jl_n_threads; i++) {
jl_ptls_t ptls2 = jl_all_tls_states[i];
// This acquire load pairs with the release stores
// in the signal handler of safepoint so we are sure that
// all the stores on those threads are visible.
// We're currently also using atomic store release in mutator threads
// (in jl_gc_state_set), but we may want to use signals to flush the
// memory operations on those threads lazily instead.
while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) {
// Use system mutexes rather than spin locking to minimize wasted CPU time
// while we wait for other threads reach a safepoint.
// This is particularly important when run under rr.
uv_mutex_lock(&safepoint_lock);
if (!jl_atomic_load_relaxed(&ptls2->gc_state))
uv_cond_wait(&safepoint_cond_begin, &safepoint_lock);
uv_mutex_unlock(&safepoint_lock);
}
}
}

int jl_safepoint_start_gc(void)
{
// The thread should have set this already
Expand Down Expand Up @@ -169,7 +221,7 @@ void jl_safepoint_end_gc(void)
jl_mach_gc_end();
# endif
uv_mutex_unlock(&safepoint_lock);
uv_cond_broadcast(&safepoint_cond);
uv_cond_broadcast(&safepoint_cond_end);
}

// this is the core of jl_set_gc_and_wait
Expand All @@ -179,6 +231,7 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT
JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct);
// The thread should have set this is already
assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != 0);
uv_cond_broadcast(&safepoint_cond_begin);
// Use normal volatile load in the loop for speed until GC finishes.
// Then use an acquire load to make sure the GC result is visible on this thread.
while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) {
Expand All @@ -187,7 +240,7 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT
// This is particularly important when run under rr.
uv_mutex_lock(&safepoint_lock);
if (jl_atomic_load_relaxed(&jl_gc_running))
uv_cond_wait(&safepoint_cond, &safepoint_lock);
uv_cond_wait(&safepoint_cond_end, &safepoint_lock);
uv_mutex_unlock(&safepoint_lock);
}
}
Expand Down
4 changes: 4 additions & 0 deletions src/signals-mach.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ extern uv_mutex_t safepoint_lock;

// see jl_safepoint_wait_thread_resume
void jl_safepoint_resume_thread_mach(jl_ptls_t ptls2, int16_t tid2)
extern uv_cond_t safepoint_cond_begin;
void jl_mach_gc_end(void)
{
// must be called with uv_mutex_lock(&safepoint_lock) and uv_mutex_lock(&ptls2->sleep_lock) held (in that order)
for (size_t i = 0; i < suspended_threads.len; i++) {
Expand Down Expand Up @@ -123,6 +125,8 @@ static void jl_mach_gc_wait(jl_ptls_t ptls2, mach_port_t thread, int16_t tid)
if (relaxed_suspend_count)
uv_mutex_unlock(&ptls2->sleep_lock);
uv_mutex_unlock(&safepoint_lock);
uv_cond_broadcast(&safepoint_cond_begin);
return 1;
}

static mach_port_t segv_port = 0;
Expand Down

0 comments on commit bf0cd8c

Please sign in to comment.