Reland "gc: avoid cpu stalls when starting" #45794

Merged 3 commits on Dec 14, 2023
1 change: 0 additions & 1 deletion src/gc-debug.c
@@ -83,7 +83,6 @@ void add_lostval_parent(jl_value_t *parent)
    innocent looking functions which allocate (and thus trigger marking) only on special cases.

    If you can't find it, you can try the following :
-   - Ensure that should_timeout() is deterministic instead of clock based.
    - Once you have a completely deterministic program which crashes on gc_verify, the addresses
      should stay constant between different runs (with same binary, same environment ...).
      Do not forget to turn off ASLR (linux: echo 0 > /proc/sys/kernel/randomize_va_space).
27 changes: 1 addition & 26 deletions src/gc.c
@@ -222,32 +222,7 @@ NOINLINE uintptr_t gc_get_stack_ptr(void)
     return (uintptr_t)jl_get_frame_addr();
 }

-#define should_timeout() 0
-
-void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
-{
-    JL_TIMING(GC, GC_Stop);
-#ifdef USE_TRACY
-    TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx;
-    TracyCZoneColor(ctx, 0x696969);
-#endif
-    assert(gc_n_threads);
-    if (gc_n_threads > 1)
-        jl_wake_libuv();
-    for (int i = 0; i < gc_n_threads; i++) {
-        jl_ptls_t ptls2 = gc_all_tls_states[i];
-        if (ptls2 != NULL) {
-            // This acquire load pairs with the release stores
-            // in the signal handler of safepoint so we are sure that
-            // all the stores on those threads are visible.
-            // We're currently also using atomic store release in mutator threads
-            // (in jl_gc_state_set), but we may want to use signals to flush the
-            // memory operations on those threads lazily instead.
-            while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state))
-                jl_cpu_pause(); // yield?
-        }
-    }
-}
+void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads);

 // malloc wrappers, aligned allocation

12 changes: 1 addition & 11 deletions src/julia_internal.h
@@ -966,17 +966,7 @@ JL_DLLEXPORT void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t
 extern pthread_mutex_t in_signal_lock;
 #endif

-static inline void jl_set_gc_and_wait(void) // n.b. not used on _OS_DARWIN_
-{
-    jl_task_t *ct = jl_current_task;
-    // reading own gc state doesn't need atomic ops since no one else
-    // should store to it.
-    int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
-    jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING);
-    jl_safepoint_wait_gc();
-    jl_atomic_store_release(&ct->ptls->gc_state, state);
-    jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state
-}
+void jl_set_gc_and_wait(void); // n.b. not used on _OS_DARWIN_

 // Query if a Julia object is if a permalloc region (due to part of a sys- pkg-image)
 STATIC_INLINE size_t n_linkage_blobs(void) JL_NOTSAFEPOINT
6 changes: 3 additions & 3 deletions src/julia_threads.h
@@ -347,9 +347,9 @@ STATIC_INLINE int8_t jl_gc_state_set(jl_ptls_t ptls, int8_t state,
                                      int8_t old_state)
 {
     jl_atomic_store_release(&ptls->gc_state, state);
-    // A safe point is required if we transition from GC-safe region to
-    // non GC-safe region.
-    if (old_state && !state)
+    if (state == JL_GC_STATE_SAFE && old_state == 0)
+        jl_gc_safepoint_(ptls);
+    if (state == 0 && old_state == JL_GC_STATE_SAFE)
         jl_gc_safepoint_(ptls);
     return old_state;
 }
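Review note on the hunk above: the old guard `if (old_state && !state)` ran the safepoint only when a thread returned from a GC-safe region to managed (unsafe, 0) code. The new pair of guards also runs it when a thread enters JL_GC_STATE_SAFE from 0; as I read the rest of this diff, that extra poll is what ends up waking a collector sleeping on safepoint_cond_begin in jl_gc_wait_for_the_world, since the armed safepoint page routes the thread through the signal handler, which broadcasts the condition. A minimal standalone sketch of the new predicate (the constant names and values mirror julia_threads.h but are assumptions here, not part of this diff):

#include <assert.h>

#define GC_STATE_UNSAFE  0 /* running managed code; must stop for GC (assumed value) */
#define GC_STATE_WAITING 1 /* stopped, waiting for the GC to finish (assumed value)  */
#define GC_STATE_SAFE    2 /* may keep running concurrently with GC (assumed value)  */

/* New rule: poll the safepoint page on both crossings between unsafe (0) and SAFE. */
static int needs_safepoint(int state, int old_state)
{
    return (state == GC_STATE_SAFE && old_state == GC_STATE_UNSAFE)
        || (state == GC_STATE_UNSAFE && old_state == GC_STATE_SAFE);
}

int main(void)
{
    assert(needs_safepoint(GC_STATE_SAFE, GC_STATE_UNSAFE));     /* entering safe: new behavior */
    assert(needs_safepoint(GC_STATE_UNSAFE, GC_STATE_SAFE));     /* leaving safe: as before     */
    assert(!needs_safepoint(GC_STATE_WAITING, GC_STATE_UNSAFE)); /* no SAFE endpoint: no poll   */
    return 0;
}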
4 changes: 2 additions & 2 deletions src/llvm-codegen-shared.h
@@ -304,8 +304,8 @@ static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::T
     BasicBlock *passBB = BasicBlock::Create(builder.getContext(), "safepoint", builder.GetInsertBlock()->getParent());
     BasicBlock *exitBB = BasicBlock::Create(builder.getContext(), "after_safepoint", builder.GetInsertBlock()->getParent());
     Constant *zero8 = ConstantInt::get(T_int8, 0);
-    builder.CreateCondBr(builder.CreateAnd(builder.CreateICmpNE(old_state, zero8), // if (old_state && !state)
-                         builder.CreateICmpEQ(state, zero8)),
+    builder.CreateCondBr(builder.CreateOr(builder.CreateICmpEQ(old_state, zero8), // if (!old_state || !state)
+                         builder.CreateICmpEQ(state, zero8)),
                          passBB, exitBB);
     builder.SetInsertPoint(passBB);
     MDNode *tbaa = get_tbaa_const(builder.getContext());
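In C terms, the emitted branch condition changes from `old_state != 0 && state == 0` to `old_state == 0 || state == 0`, matching the inline comment: compiled code now reaches the safepoint block whenever either endpoint of the transition is the unsafe state 0, a conservative superset of the two exact transitions tested in `jl_gc_state_set`. A compilable sketch of the two guards (illustrative only, not Julia's API):

#include <assert.h>

/* Guard emitted before this change: poll only when leaving a safe region for 0. */
static int old_guard(int old_state, int state) { return old_state != 0 && state == 0; }

/* Guard emitted after this change: poll whenever either side of the transition
 * is the unsafe state 0; the slow path decides what actually needs to happen. */
static int new_guard(int old_state, int state) { return old_state == 0 || state == 0; }

int main(void)
{
    assert(old_guard(2, 0) && new_guard(2, 0));  /* leaving safe: caught by both   */
    assert(!old_guard(0, 2) && new_guard(0, 2)); /* entering safe: only caught now */
    return 0;
}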
4 changes: 2 additions & 2 deletions src/rtutils.c
@@ -281,7 +281,6 @@ JL_DLLEXPORT void jl_eh_restore_state(jl_handler_t *eh)
     // This function should **NOT** have any safepoint before the ones at the
     // end.
     sig_atomic_t old_defer_signal = ct->ptls->defer_signal;
-    int8_t old_gc_state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
     ct->eh = eh->prev;
     ct->gcstack = eh->gcstack;
     small_arraylist_t *locks = &ct->ptls->locks;
@@ -293,9 +292,10 @@
     }
     ct->world_age = eh->world_age;
     ct->ptls->defer_signal = eh->defer_signal;
+    int8_t old_gc_state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
     if (old_gc_state != eh->gc_state)
         jl_atomic_store_release(&ct->ptls->gc_state, eh->gc_state);
-    if (!eh->gc_state)
+    if (!old_gc_state || !eh->gc_state) // it was or is unsafe now
         jl_gc_safepoint_(ct->ptls);
     if (old_defer_signal && !eh->defer_signal)
         jl_sigint_safepoint(ct->ptls);
82 changes: 71 additions & 11 deletions src/safepoint.c
@@ -44,7 +44,8 @@ uint16_t jl_safepoint_enable_cnt[4] = {0, 0, 0, 0};
 // load/store so that threads waiting for the GC doesn't have to also
 // fight on the safepoint lock...
 uv_mutex_t safepoint_lock;
-uv_cond_t safepoint_cond;
+uv_cond_t safepoint_cond_begin;
+uv_cond_t safepoint_cond_end;

 static void jl_safepoint_enable(int idx) JL_NOTSAFEPOINT
 {
@@ -91,7 +92,8 @@ static void jl_safepoint_disable(int idx) JL_NOTSAFEPOINT
 void jl_safepoint_init(void)
 {
     uv_mutex_init(&safepoint_lock);
-    uv_cond_init(&safepoint_cond);
+    uv_cond_init(&safepoint_cond_begin);
+    uv_cond_init(&safepoint_cond_end);
     // jl_page_size isn't available yet.
     size_t pgsz = jl_getpagesize();
 #ifdef _OS_WINDOWS_
@@ -124,12 +126,51 @@
     jl_safepoint_pages = addr;
 }

+void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
+{
+    JL_TIMING(GC, GC_Stop);
+#ifdef USE_TRACY
+    TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx;
+    TracyCZoneColor(ctx, 0x696969);
+#endif
+    assert(gc_n_threads);
+    if (gc_n_threads > 1)
+        jl_wake_libuv();
+    for (int i = 0; i < gc_n_threads; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        if (ptls2 != NULL) {
+            // This acquire load pairs with the release stores
+            // in the signal handler of safepoint so we are sure that
+            // all the stores on those threads are visible.
+            // We're currently also using atomic store release in mutator threads
+            // (in jl_gc_state_set), but we may want to use signals to flush the
+            // memory operations on those threads lazily instead.
+            while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) {
+                // Use system mutexes rather than spin locking to minimize wasted CPU time
+                // while we wait for other threads reach a safepoint.
+                // This is particularly important when run under rr.
+                uv_mutex_lock(&safepoint_lock);
+                if (!jl_atomic_load_relaxed(&ptls2->gc_state))
+                    uv_cond_wait(&safepoint_cond_begin, &safepoint_lock);
+                uv_mutex_unlock(&safepoint_lock);
+            }
+        }
+    }
+}
+
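Two idioms in the relocated function deserve a note. The `!relaxed || !acquire` double load keeps the cheap relaxed read on the fast path and pays for acquire ordering only once the state already looks nonzero, and re-checking `gc_state` after taking `safepoint_lock` ensures a broadcast cannot slip in between the test and the `uv_cond_wait`. A self-contained pthreads rendering of the same shape (the pthreads substitution and all names here are mine, not the PR's):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond_begin = PTHREAD_COND_INITIALIZER;
static atomic_int gc_state; /* 0 = the mutator has not reached a safepoint yet */

/* Collector side: sleep rather than spin until the mutator publishes a nonzero
 * state. The relaxed load is the cheap fast-path test; the acquire load runs
 * only once the flag already looks set and pairs with the release store below. */
static void wait_until_safepoint_reached(void)
{
    while (atomic_load_explicit(&gc_state, memory_order_relaxed) == 0 ||
           atomic_load_explicit(&gc_state, memory_order_acquire) == 0) {
        pthread_mutex_lock(&lock);
        /* re-check under the lock: a wakeup cannot be lost between this test
         * and the wait, because the mutator broadcasts while holding the lock */
        if (atomic_load_explicit(&gc_state, memory_order_relaxed) == 0)
            pthread_cond_wait(&cond_begin, &lock); /* atomically unlocks and sleeps */
        pthread_mutex_unlock(&lock);
    }
}

/* Mutator side: publish the state with release, then wake any sleeping waiter. */
static void *announce_safepoint(void *arg)
{
    (void)arg;
    atomic_store_explicit(&gc_state, 2, memory_order_release);
    pthread_mutex_lock(&lock);
    pthread_cond_broadcast(&cond_begin);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, announce_safepoint, NULL);
    wait_until_safepoint_reached(); /* woken via the condvar, no busy spin */
    pthread_join(t, NULL);
    puts("safepoint reached");
    return 0;
}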
 int jl_safepoint_start_gc(void)
 {
-    // The thread should have set this already
+    // The thread should have just set this before entry
     assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) == JL_GC_STATE_WAITING);
-    jl_safepoint_wait_thread_resume(); // make sure we are permitted to run GC now (we might be required to stop instead)
     uv_mutex_lock(&safepoint_lock);
+    uv_cond_broadcast(&safepoint_cond_begin);
+    // make sure we are permitted to run GC now (we might be required to stop instead)
+    jl_task_t *ct = jl_current_task;
+    while (jl_atomic_load_relaxed(&ct->ptls->suspend_count)) {
+        uv_mutex_unlock(&safepoint_lock);
+        jl_safepoint_wait_thread_resume();
+        uv_mutex_lock(&safepoint_lock);
+    }
     // In case multiple threads enter the GC at the same time, only allow
     // one of them to actually run the collection. We can't just let the
     // master thread do the GC since it might be running unmanaged code
@@ -169,7 +210,22 @@ void jl_safepoint_end_gc(void)
     jl_mach_gc_end();
 # endif
     uv_mutex_unlock(&safepoint_lock);
-    uv_cond_broadcast(&safepoint_cond);
+    uv_cond_broadcast(&safepoint_cond_end);
 }

+void jl_set_gc_and_wait(void) // n.b. not used on _OS_DARWIN_
+{
+    jl_task_t *ct = jl_current_task;
+    // reading own gc state doesn't need atomic ops since no one else
+    // should store to it.
+    int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
+    jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING);
+    uv_mutex_lock(&safepoint_lock);
+    uv_cond_broadcast(&safepoint_cond_begin);
+    uv_mutex_unlock(&safepoint_lock);
+    jl_safepoint_wait_gc();
+    jl_atomic_store_release(&ct->ptls->gc_state, state);
+    jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state
+}
+
 // this is the core of jl_set_gc_and_wait
@@ -187,7 +243,7 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT
         // This is particularly important when run under rr.
         uv_mutex_lock(&safepoint_lock);
         if (jl_atomic_load_relaxed(&jl_gc_running))
-            uv_cond_wait(&safepoint_cond, &safepoint_lock);
+            uv_cond_wait(&safepoint_cond_end, &safepoint_lock);
         uv_mutex_unlock(&safepoint_lock);
     }
 }
@@ -204,6 +260,14 @@ void jl_safepoint_wait_thread_resume(void)
     int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
     jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING);
     uv_mutex_lock(&ct->ptls->sleep_lock);
+    if (jl_atomic_load_relaxed(&ct->ptls->suspend_count)) {
+        // defer this broadcast until we determine whether uv_cond_wait is really going to be needed
+        uv_mutex_unlock(&ct->ptls->sleep_lock);
+        uv_mutex_lock(&safepoint_lock);
+        uv_cond_broadcast(&safepoint_cond_begin);
+        uv_mutex_unlock(&safepoint_lock);
+        uv_mutex_lock(&ct->ptls->sleep_lock);
+    }
     while (jl_atomic_load_relaxed(&ct->ptls->suspend_count))
         uv_cond_wait(&ct->ptls->wake_signal, &ct->ptls->sleep_lock);
     // must while still holding the mutex_unlock, so we know other threads in
@@ -249,7 +313,7 @@ int jl_safepoint_suspend_thread(int tid, int waitstate)
                 break;
             if (waitstate == 3 && state2 == JL_GC_STATE_WAITING)
                 break;
-            jl_cpu_pause(); // yield?
+            jl_cpu_pause(); // yield (wait for safepoint_cond_begin, for example)?
         }
     }
     return suspend_count;
@@ -262,9 +326,7 @@
     if (0 > tid || tid >= jl_atomic_load_acquire(&jl_n_threads))
         return 0;
     jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
-# ifdef _OS_DARWIN_
     uv_mutex_lock(&safepoint_lock);
-# endif
     uv_mutex_lock(&ptls2->sleep_lock);
     int16_t suspend_count = jl_atomic_load_relaxed(&ptls2->suspend_count);
     if (suspend_count == 1) { // last to unsuspend
@@ -283,9 +345,7 @@
         jl_safepoint_disable(3);
     }
     uv_mutex_unlock(&ptls2->sleep_lock);
-# ifdef _OS_DARWIN_
     uv_mutex_unlock(&safepoint_lock);
-# endif
     return suspend_count;
 }

28 changes: 23 additions & 5 deletions src/scheduler.c
@@ -32,7 +32,7 @@ static const int16_t sleeping_like_the_dead JL_UNUSED = 2;
 // a running count of how many threads are currently not_sleeping
 // plus a running count of the number of in-flight wake-ups
 // n.b. this may temporarily exceed jl_n_threads
-static _Atomic(int) nrunning = 1;
+static _Atomic(int) nrunning = 0;

 // invariant: No thread is ever asleep unless sleep_check_state is sleeping (or we have a wakeup signal pending).
 // invariant: Any particular thread is not asleep unless that thread's sleep_check_state is sleeping.
@@ -201,6 +201,20 @@ void jl_threadfun(void *arg)
 }

+
+void jl_init_thread_scheduler(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    uv_mutex_init(&ptls->sleep_lock);
+    uv_cond_init(&ptls->wake_signal);
+    // record that there is now another thread that may be used to schedule work
+    // we will decrement this again in scheduler_delete_thread, only slightly
+    // in advance of pthread_join (which hopefully itself also had been
+    // adopted by now and is included in nrunning too)
+    (void)jl_atomic_fetch_add_relaxed(&nrunning, 1);
+    // n.b. this is the only point in the code where we ignore the invariants on the ordering of nrunning
+    // since we are being initialized from foreign code, we could not necessarily have expected or predicted that to happen
+}
+
 int jl_running_under_rr(int recheck)
 {
 #ifdef _OS_LINUX_
@@ -581,9 +595,10 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,

 void scheduler_delete_thread(jl_ptls_t ptls) JL_NOTSAFEPOINT
 {
-    if (jl_atomic_exchange_relaxed(&ptls->sleep_check_state, sleeping_like_the_dead) != sleeping) {
-        int wasrunning = jl_atomic_fetch_add_relaxed(&nrunning, -1);
-        if (wasrunning == 1) {
+    int notsleeping = jl_atomic_exchange_relaxed(&ptls->sleep_check_state, sleeping_like_the_dead) == not_sleeping;
+    jl_fence();
+    if (notsleeping) {
+        if (jl_atomic_load_relaxed(&nrunning) == 1) {
             jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0];
             // This was the last running thread, and there is no thread with !may_sleep
             // so make sure tid 0 is notified to check wait_empty
@@ -592,8 +607,11 @@
             uv_mutex_unlock(&ptls2->sleep_lock);
         }
     }
-    jl_fence();
+    else {
+        jl_atomic_fetch_add_relaxed(&nrunning, 1);
+    }
     jl_wakeup_thread(0); // force thread 0 to see that we do not have the IO lock (and am dead)
+    jl_atomic_fetch_add_relaxed(&nrunning, -1);
 }

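Review note: `nrunning` counts not-sleeping threads, so in the rewritten exit path a thread that leaves while marked sleeping first re-adds itself, which makes the single trailing decrement correct in both cases; deferring that decrement until after `jl_wakeup_thread(0)` keeps the exiting thread counted while thread 0 is being nudged, as I read it. A condensed, compilable model of just that bookkeeping (illustrative; `notify_thread0` is a hypothetical stand-in for `jl_wakeup_thread(0)`):

#include <assert.h>
#include <stdatomic.h>

static atomic_int nrunning; /* number of not-sleeping threads (plus in-flight wakeups) */

static void notify_thread0(void) { /* stand-in for jl_wakeup_thread(0) */ }

static void scheduler_exit(int was_not_sleeping)
{
    if (!was_not_sleeping)
        atomic_fetch_add(&nrunning, 1); /* we were not counted; balance the -1 below */
    notify_thread0();                   /* still counted here, so thread 0 cannot see 0 early */
    atomic_fetch_add(&nrunning, -1);    /* finally leave the running count */
}

int main(void)
{
    atomic_store(&nrunning, 1); /* this thread, counted as not sleeping */
    scheduler_exit(1);
    assert(atomic_load(&nrunning) == 0);

    atomic_store(&nrunning, 0); /* this thread was sleeping: not counted */
    scheduler_exit(0);
    assert(atomic_load(&nrunning) == 0);
    return 0;
}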
 #ifdef __cplusplus
2 changes: 2 additions & 0 deletions src/signals-mach.c
@@ -45,6 +45,7 @@ static void attach_exception_port(thread_port_t thread, int segv_only);
 // low 16 bits are the thread id, the next 8 bits are the original gc_state
 static arraylist_t suspended_threads;
 extern uv_mutex_t safepoint_lock;
+extern uv_cond_t safepoint_cond_begin;

 // see jl_safepoint_wait_thread_resume
 void jl_safepoint_resume_thread_mach(jl_ptls_t ptls2, int16_t tid2)
@@ -122,6 +123,7 @@ static void jl_mach_gc_wait(jl_ptls_t ptls2, mach_port_t thread, int16_t tid)
     }
     if (relaxed_suspend_count)
         uv_mutex_unlock(&ptls2->sleep_lock);
+    uv_cond_broadcast(&safepoint_cond_begin);
     uv_mutex_unlock(&safepoint_lock);
 }

5 changes: 2 additions & 3 deletions src/threading.c
@@ -316,6 +316,7 @@ int jl_all_tls_states_size;
 static uv_cond_t cond;
 // concurrent reads are permitted, using the same pattern as mtsmall_arraylist
 // it is implemented separately because the API of direct jl_all_tls_states use is already widely prevalent
+void jl_init_thread_scheduler(jl_ptls_t ptls) JL_NOTSAFEPOINT;

 // return calling thread's ID
 JL_DLLEXPORT int16_t jl_threadid(void)
@@ -379,9 +380,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid)
     ptls->bt_data = bt_data;
     small_arraylist_new(&ptls->locks, 0);
     jl_init_thread_heap(ptls);
-
-    uv_mutex_init(&ptls->sleep_lock);
-    uv_cond_init(&ptls->wake_signal);
+    jl_init_thread_scheduler(ptls);

     uv_mutex_lock(&tls_lock);
     if (tid == -1)