From bf0cd8cfd7ff2308cbb0191bf13be01e1e0a7a0c Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Tue, 12 Dec 2023 18:27:19 +0000
Subject: [PATCH] Reland "gc: avoid cpu stalls when starting"

This reverts commit 4801b6ce9a4d737ad1722ca620a7bc8087590f2e and adds
the safepoints needed to catch the unsafe->safe transition also.
---
 src/gc-debug.c      |  1 -
 src/gc.c            | 27 +-------------------
 src/julia_threads.h |  6 ++---
 src/safepoint.c     | 61 ++++++++++++++++++++++++++++++++++++++++++---
 src/signals-mach.c  |  4 +++
 5 files changed, 65 insertions(+), 34 deletions(-)

diff --git a/src/gc-debug.c b/src/gc-debug.c
index 87861a5d89b9c7..84db461ce2b0a2 100644
--- a/src/gc-debug.c
+++ b/src/gc-debug.c
@@ -83,7 +83,6 @@ void add_lostval_parent(jl_value_t *parent)
  innocent looking functions which allocate (and thus trigger marking) only on special cases.
 
  If you can't find it, you can try the following :
- - Ensure that should_timeout() is deterministic instead of clock based.
  - Once you have a completely deterministic program which crashes on gc_verify, the addresses
    should stay constant between different runs (with same binary, same environment ...).
    Do not forget to turn off ASLR (linux: echo 0 > /proc/sys/kernel/randomize_va_space).
diff --git a/src/gc.c b/src/gc.c
index 17975a075dea44..31e2da6ff997a2 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -222,32 +222,7 @@ NOINLINE uintptr_t gc_get_stack_ptr(void)
     return (uintptr_t)jl_get_frame_addr();
 }
 
-#define should_timeout() 0
-
-void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
-{
-    JL_TIMING(GC, GC_Stop);
-#ifdef USE_TRACY
-    TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx;
-    TracyCZoneColor(ctx, 0x696969);
-#endif
-    assert(gc_n_threads);
-    if (gc_n_threads > 1)
-        jl_wake_libuv();
-    for (int i = 0; i < gc_n_threads; i++) {
-        jl_ptls_t ptls2 = gc_all_tls_states[i];
-        if (ptls2 != NULL) {
-            // This acquire load pairs with the release stores
-            // in the signal handler of safepoint so we are sure that
-            // all the stores on those threads are visible.
-            // We're currently also using atomic store release in mutator threads
-            // (in jl_gc_state_set), but we may want to use signals to flush the
-            // memory operations on those threads lazily instead.
-            while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state))
-                jl_cpu_pause(); // yield?
-        }
-    }
-}
+void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads);
 
 // malloc wrappers, aligned allocation
 
diff --git a/src/julia_threads.h b/src/julia_threads.h
index af7df23447c012..292c11f61d60d0 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -347,9 +347,9 @@ STATIC_INLINE int8_t jl_gc_state_set(jl_ptls_t ptls, int8_t state,
                                      int8_t old_state)
 {
     jl_atomic_store_release(&ptls->gc_state, state);
-    // A safe point is required if we transition from GC-safe region to
-    // non GC-safe region.
-    if (old_state && !state)
+    if (state == JL_GC_STATE_SAFE && old_state == 0)
+        jl_gc_safepoint_(ptls);
+    if (state == 0 && old_state == JL_GC_STATE_SAFE)
         jl_gc_safepoint_(ptls);
     return old_state;
 }
diff --git a/src/safepoint.c b/src/safepoint.c
index 51dae64e84e721..80490ada4e9e4a 100644
--- a/src/safepoint.c
+++ b/src/safepoint.c
@@ -44,7 +44,8 @@ uint16_t jl_safepoint_enable_cnt[4] = {0, 0, 0, 0};
 // load/store so that threads waiting for the GC doesn't have to also
 // fight on the safepoint lock...
 uv_mutex_t safepoint_lock;
-uv_cond_t safepoint_cond;
+uv_cond_t safepoint_cond_begin;
+uv_cond_t safepoint_cond_end;
 
 static void jl_safepoint_enable(int idx) JL_NOTSAFEPOINT
 {
@@ -91,7 +92,8 @@ static void jl_safepoint_disable(int idx) JL_NOTSAFEPOINT
 void jl_safepoint_init(void)
 {
     uv_mutex_init(&safepoint_lock);
-    uv_cond_init(&safepoint_cond);
+    uv_cond_init(&safepoint_cond_begin);
+    uv_cond_init(&safepoint_cond_end);
     // jl_page_size isn't available yet.
     size_t pgsz = jl_getpagesize();
 #ifdef _OS_WINDOWS_
@@ -124,6 +126,56 @@ void jl_safepoint_init(void)
     jl_safepoint_pages = addr;
 }
 
+void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
+{
+    JL_TIMING(GC, GC_Stop);
+#ifdef USE_TRACY
+    TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx;
+    TracyCZoneColor(ctx, 0x696969);
+#endif
+    assert(gc_n_threads);
+    if (gc_n_threads > 1)
+        jl_wake_libuv();
+    for (int i = 0; i < gc_n_threads; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        if (ptls2 != NULL) {
+            // This acquire load pairs with the release stores
+            // in the signal handler of safepoint so we are sure that
+            // all the stores on those threads are visible.
+            // We're currently also using atomic store release in mutator threads
+            // (in jl_gc_state_set), but we may want to use signals to flush the
+            // memory operations on those threads lazily instead.
+            while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state))
+                jl_cpu_pause(); // yield?
+        }
+    }
+}
+
+void jl_gc_wait_for_the_world(void)
+{
+    assert(jl_n_threads);
+    if (jl_n_threads > 1)
+        jl_wake_libuv();
+    for (int i = 0; i < jl_n_threads; i++) {
+        jl_ptls_t ptls2 = jl_all_tls_states[i];
+        // This acquire load pairs with the release stores
+        // in the signal handler of safepoint so we are sure that
+        // all the stores on those threads are visible.
+        // We're currently also using atomic store release in mutator threads
+        // (in jl_gc_state_set), but we may want to use signals to flush the
+        // memory operations on those threads lazily instead.
+        while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) {
+            // Use system mutexes rather than spin locking to minimize wasted CPU time
+            // while we wait for other threads reach a safepoint.
+            // This is particularly important when run under rr.
+            uv_mutex_lock(&safepoint_lock);
+            if (!jl_atomic_load_relaxed(&ptls2->gc_state))
+                uv_cond_wait(&safepoint_cond_begin, &safepoint_lock);
+            uv_mutex_unlock(&safepoint_lock);
+        }
+    }
+}
+
 int jl_safepoint_start_gc(void)
 {
     // The thread should have set this already
@@ -169,7 +221,7 @@ void jl_safepoint_end_gc(void)
     jl_mach_gc_end();
 #  endif
     uv_mutex_unlock(&safepoint_lock);
-    uv_cond_broadcast(&safepoint_cond);
+    uv_cond_broadcast(&safepoint_cond_end);
 }
 
 // this is the core of jl_set_gc_and_wait
@@ -179,6 +231,7 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT
     JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct);
     // The thread should have set this is already
     assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != 0);
+    uv_cond_broadcast(&safepoint_cond_begin);
     // Use normal volatile load in the loop for speed until GC finishes.
     // Then use an acquire load to make sure the GC result is visible on this thread.
     while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) {
@@ -187,7 +240,7 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT
         // This is particularly important when run under rr.
         uv_mutex_lock(&safepoint_lock);
         if (jl_atomic_load_relaxed(&jl_gc_running))
-            uv_cond_wait(&safepoint_cond, &safepoint_lock);
+            uv_cond_wait(&safepoint_cond_end, &safepoint_lock);
         uv_mutex_unlock(&safepoint_lock);
     }
 }
diff --git a/src/signals-mach.c b/src/signals-mach.c
index b7ab8ecaf8ef67..cce3af22b53f0e 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -48,6 +48,8 @@ extern uv_mutex_t safepoint_lock;
 
 // see jl_safepoint_wait_thread_resume
 void jl_safepoint_resume_thread_mach(jl_ptls_t ptls2, int16_t tid2)
+extern uv_cond_t safepoint_cond_begin;
+void jl_mach_gc_end(void)
 {
     // must be called with uv_mutex_lock(&safepoint_lock) and uv_mutex_lock(&ptls2->sleep_lock) held (in that order)
     for (size_t i = 0; i < suspended_threads.len; i++) {
@@ -123,6 +125,8 @@ static void jl_mach_gc_wait(jl_ptls_t ptls2, mach_port_t thread, int16_t tid)
     if (relaxed_suspend_count)
         uv_mutex_unlock(&ptls2->sleep_lock);
     uv_mutex_unlock(&safepoint_lock);
+    uv_cond_broadcast(&safepoint_cond_begin);
+    return 1;
 }
 
 static mach_port_t segv_port = 0;