Skip to content

Commit

Permalink
GC scheduler refinements (#52294)
Browse files Browse the repository at this point in the history
Supersedes #51061 and
#51414.

Still needs more perf analysis.

(cherry picked from commit e26c257)
  • Loading branch information
d-netto authored and KristofferC committed Dec 12, 2023
1 parent 6627b91 commit 4241d4c
Showing 2 changed files with 101 additions and 29 deletions.
117 changes: 101 additions & 16 deletions src/gc.c
Original file line number Diff line number Diff line change
@@ -27,6 +27,8 @@ uv_mutex_t gc_threads_lock;
uv_cond_t gc_threads_cond;
// To indicate whether concurrent sweeping should run
uv_sem_t gc_sweep_assists_needed;
// Mutex used to coordinate entry of GC threads in the mark loop
uv_mutex_t gc_queue_observer_lock;

// Linked list of callback functions

@@ -2706,8 +2708,10 @@ void gc_mark_and_steal(jl_ptls_t ptls)
jl_gc_markqueue_t *mq = &ptls->mark_queue;
jl_gc_markqueue_t *mq_master = NULL;
int master_tid = jl_atomic_load(&gc_master_tid);
if (master_tid != -1)
mq_master = &gc_all_tls_states[master_tid]->mark_queue;
if (master_tid == -1) {
return;
}
mq_master = &gc_all_tls_states[master_tid]->mark_queue;
void *new_obj;
jl_gc_chunk_t c;
pop : {
@@ -2782,28 +2786,108 @@ void gc_mark_and_steal(jl_ptls_t ptls)
}
}

size_t gc_count_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT
{
// assume each chunk is worth 256 units of work and each pointer
// is worth 1 unit of work
size_t work = 256 * (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) -
jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top));
work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) -
jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top));
return work;
}

/**
* Correctness argument for the mark-loop termination protocol.
*
* Safety properties:
* - No work items shall be in any thread's queues when `gc_mark_loop_barrier` observes
* that `gc_n_threads_marking` is zero.
*
* - No work item shall be stolen from the master thread (i.e. mutator thread which started
* GC and which helped the `jl_n_markthreads` - 1 threads to mark) after
* `gc_mark_loop_barrier` observes that `gc_n_threads_marking` is zero. This property is
* necessary because we call `gc_mark_loop_serial` after marking the finalizer list in
* `_jl_gc_collect`, and want to ensure that we have the serial mark-loop semantics there,
* and that no work is stolen from us at that point.
*
* Proof:
* - Suppose the master thread observes that `gc_n_threads_marking` is zero in
* `gc_mark_loop_barrier` and there is a work item left in one thread's queue at that point.
* Since threads try to steal from all threads' queues, this implies that all threads must
* have tried to steal from the queue which still has a work item left, but failed to do so,
* which violates the semantics of Chase-Lev's work-stealing queue.
*
* - Let E1 be the event "master thread writes -1 to gc_master_tid" and E2 be the even
* "master thread observes that `gc_n_threads_marking` is zero". Since we're using
* sequentially consistent atomics, E1 => E2. Now suppose one thread which is spinning in
* `gc_should_mark` tries to enter the mark-loop after E2. In order to do so, it must
* increment `gc_n_threads_marking` to 1 in an event E3, and then read `gc_master_tid` in an
* event E4. Since we're using sequentially consistent atomics, E3 => E4. Since we observed
* `gc_n_threads_marking` as zero in E2, then E2 => E3, and we conclude E1 => E4, so that
* the thread which is spinning in `gc_should_mark` must observe that `gc_master_tid` is -1
* and therefore won't enter the mark-loop.
*/

int gc_should_mark(jl_ptls_t ptls)
{
int should_mark = 0;
int n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
// fast path
if (n_threads_marking == 0) {
return 0;
}
uv_mutex_lock(&gc_queue_observer_lock);
while (1) {
int tid = jl_atomic_load(&gc_master_tid);
// fast path
if (tid == -1) {
break;
}
n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
// fast path
if (n_threads_marking == 0) {
break;
}
size_t work = gc_count_work_in_queue(gc_all_tls_states[tid]);
for (tid = gc_first_tid; tid < gc_first_tid + jl_n_markthreads; tid++) {
work += gc_count_work_in_queue(gc_all_tls_states[tid]);
}
// if there is a lot of work left, enter the mark loop
if (work >= 16 * n_threads_marking) {
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
should_mark = 1;
break;
}
jl_cpu_pause();
}
uv_mutex_unlock(&gc_queue_observer_lock);
return should_mark;
}

void gc_wake_all_for_marking(jl_ptls_t ptls)
{
jl_atomic_store(&gc_master_tid, ptls->tid);
uv_mutex_lock(&gc_threads_lock);
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
uv_cond_broadcast(&gc_threads_cond);
uv_mutex_unlock(&gc_threads_lock);
}

void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
{
int backoff = GC_BACKOFF_MIN;
if (master) {
jl_atomic_store(&gc_master_tid, ptls->tid);
// Wake threads up and try to do some work
uv_mutex_lock(&gc_threads_lock);
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
uv_cond_broadcast(&gc_threads_cond);
uv_mutex_unlock(&gc_threads_lock);
gc_wake_all_for_marking(ptls);
gc_mark_and_steal(ptls);
jl_atomic_fetch_add(&gc_n_threads_marking, -1);
}
while (jl_atomic_load(&gc_n_threads_marking) > 0) {
// Try to become a thief while other threads are marking
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
if (jl_atomic_load(&gc_master_tid) != -1) {
gc_mark_and_steal(ptls);
while (1) {
int should_mark = gc_should_mark(ptls);
if (!should_mark) {
break;
}
gc_mark_and_steal(ptls);
jl_atomic_fetch_add(&gc_n_threads_marking, -1);
// Failed to steal
gc_backoff(&backoff);
}
}

@@ -3543,6 +3627,7 @@ void jl_gc_init(void)
uv_mutex_init(&gc_threads_lock);
uv_cond_init(&gc_threads_cond);
uv_sem_init(&gc_sweep_assists_needed, 0);
uv_mutex_init(&gc_queue_observer_lock);

jl_gc_init_page();
jl_gc_debug_init();
13 changes: 0 additions & 13 deletions src/gc.h
Original file line number Diff line number Diff line change
@@ -187,19 +187,6 @@ extern jl_gc_global_page_pool_t global_page_pool_lazily_freed;
extern jl_gc_global_page_pool_t global_page_pool_clean;
extern jl_gc_global_page_pool_t global_page_pool_freed;

#define GC_BACKOFF_MIN 4
#define GC_BACKOFF_MAX 12

STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
{
if (*i < GC_BACKOFF_MAX) {
(*i)++;
}
for (int j = 0; j < (1 << *i); j++) {
jl_cpu_pause();
}
}

// Lock-free stack implementation taken
// from Herlihy's "The Art of Multiprocessor Programming"

0 comments on commit 4241d4c

Please sign in to comment.