diff --git a/base/deprecated.jl b/base/deprecated.jl
index 83ea85cddff7a..ae80e0c082dea 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -327,4 +327,6 @@ function setproperty!(ci::CodeInfo, s::Symbol, v)
     return setfield!(ci, s, convert(fieldtype(CodeInfo, s), v))
 end
 
+@eval Threads nthreads() = threadpoolsize()
+
 # END 1.9 deprecations
diff --git a/base/partr.jl b/base/partr.jl
index a4cfcb60fe520..c5bb6603d53af 100644
--- a/base/partr.jl
+++ b/base/partr.jl
@@ -2,7 +2,7 @@
 
 module Partr
 
-using ..Threads: SpinLock, nthreads, threadid
+using ..Threads: SpinLock, maxthreadid, threadid
 
 # a task minheap
 mutable struct taskheap
diff --git a/base/pcre.jl b/base/pcre.jl
index 963b3ee4726a2..c46f969839f34 100644
--- a/base/pcre.jl
+++ b/base/pcre.jl
@@ -28,7 +28,7 @@ THREAD_MATCH_CONTEXTS::Vector{Ptr{Cvoid}} = [C_NULL]
 PCRE_COMPILE_LOCK = nothing
 
 _tid() = Int(ccall(:jl_threadid, Int16, ())) + 1
-_nth() = Int(unsafe_load(cglobal(:jl_n_threads, Cint)))
+_mth() = Int(Core.Intrinsics.atomic_pointerref(cglobal(:jl_n_threads, Cint), :acquire))
 
 function get_local_match_context()
     tid = _tid()
@@ -40,7 +40,7 @@ function get_local_match_context()
         try
             ctxs = THREAD_MATCH_CONTEXTS
             if length(ctxs) < tid
-                global THREAD_MATCH_CONTEXTS = ctxs = copyto!(fill(C_NULL, _nth()), ctxs)
+                global THREAD_MATCH_CONTEXTS = ctxs = copyto!(fill(C_NULL, length(ctxs) + _mth()), ctxs)
             end
         finally
             unlock(l)
diff --git a/base/task.jl b/base/task.jl
index 5601fea70a112..0756836936306 100644
--- a/base/task.jl
+++ b/base/task.jl
@@ -733,7 +733,7 @@ function workqueue_for(tid::Int)
     @lock l begin
         qs = Workqueues
         if length(qs) < tid
-            nt = Threads.nthreads()
+            nt = Threads.maxthreadid()
             @assert tid <= nt
             global Workqueues = qs = copyto!(typeof(qs)(undef, length(qs) + nt - 1), qs)
         end
@@ -746,7 +746,7 @@ end
 function enq_work(t::Task)
     (t._state === task_state_runnable && t.queue === nothing) || error("schedule: Task not runnable")
 
-    if t.sticky || Threads.nthreads() == 1
+    if t.sticky || Threads.threadpoolsize() == 1
         tid = Threads.threadid(t)
         if tid == 0
             # Issue #41324
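The `pcre.jl` and `base/task.jl` hunks above make the same change: per-thread tables that were sized once from `nthreads()` are now grown on demand, because the process can gain threads after startup (through thread adoption), so the startup thread count no longer bounds `threadid()`. Below is a minimal sketch of that pattern in isolation; `SLOTS`, `SLOTS_LOCK`, and `local_slot` are illustrative names, not Base APIs.

```julia
# Grow-on-demand per-thread slots, mirroring THREAD_MATCH_CONTEXTS and
# Workqueues above. Threads.maxthreadid() is a lower bound that only
# increases, so any tid observed before the call fits in the new table.
const SLOTS_LOCK = ReentrantLock()
SLOTS = fill(C_NULL, 1)  # one Ptr{Cvoid} cell per thread id

function local_slot()
    tid = Threads.threadid()
    if length(SLOTS) < tid
        lock(SLOTS_LOCK) do
            s = SLOTS
            if length(s) < tid  # re-check under the lock
                # over-allocate (length + maxthreadid) so threads adopted
                # while we copy still have room, as the hunks above do
                global SLOTS = copyto!(fill(C_NULL, length(s) + Threads.maxthreadid()), s)
            end
        end
    end
    return SLOTS[tid]
end
```

The table is only ever replaced by a strictly larger copy, never resized in place, so an unsynchronized reader always sees a consistent (if stale) vector.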
diff --git a/base/threadingconstructs.jl b/base/threadingconstructs.jl
index 0852fafe192ec..b27ebfdf657da 100644
--- a/base/threadingconstructs.jl
+++ b/base/threadingconstructs.jl
@@ -11,20 +11,25 @@ ID `1`.
 """
 threadid() = Int(ccall(:jl_threadid, Int16, ())+1)
 
+# lower bound on the largest threadid()
 """
-    Threads.nthreads([:default|:interactive]) -> Int
+    Threads.maxthreadid() -> Int
 
-Get the number of threads (across all thread pools or within the specified
-thread pool) available to Julia. The number of threads across all thread
-pools is the inclusive upper bound on [`threadid()`](@ref).
+Get a lower bound on the number of threads (across all thread pools) available
+to the Julia process, with atomic-acquire semantics. The result is always `>=`
+[`threadid()`](@ref).
+"""
+maxthreadid() = Int(Core.Intrinsics.atomic_pointerref(cglobal(:jl_n_threads, Cint), :acquire))
 
-See also: `BLAS.get_num_threads` and `BLAS.set_num_threads` in the
-[`LinearAlgebra`](@ref man-linalg) standard library, and `nprocs()` in the
-[`Distributed`](@ref man-distributed) standard library.
 """
-function nthreads end
+    Threads.nthreads(:default | :interactive) -> Int
 
-nthreads() = Int(unsafe_load(cglobal(:jl_n_threads, Cint)))
+Get the current number of threads within the specified thread pool. The threads in the
+default pool have ID numbers `1:nthreads(:default)`.
+
+See also `BLAS.get_num_threads` and `BLAS.set_num_threads` in the [`LinearAlgebra`](@ref
+man-linalg) standard library, `nprocs()` in the [`Distributed`](@ref man-distributed)
+standard library, and [`Threads.maxthreadid()`](@ref).
+"""
 function nthreads(pool::Symbol)
     if pool === :default
         tpid = Int8(0)
@@ -35,6 +40,7 @@ end
     end
     return _nthreads_in_pool(tpid)
 end
+
 function _nthreads_in_pool(tpid::Int8)
     p = unsafe_load(cglobal(:jl_n_threads_per_pool, Ptr{Cint}))
     return Int(unsafe_load(p, tpid + 1))
@@ -57,10 +63,21 @@
 Returns the number of threadpools currently configured.
 """
 nthreadpools() = Int(unsafe_load(cglobal(:jl_n_threadpools, Cint)))
 
+"""
+    Threads.threadpoolsize()
+
+Get the number of threads available to the Julia default worker thread pool.
+
+See also: `BLAS.get_num_threads` and `BLAS.set_num_threads` in the
+[`LinearAlgebra`](@ref man-linalg) standard library, and `nprocs()` in the
+[`Distributed`](@ref man-distributed) standard library.
+"""
+threadpoolsize() = Threads._nthreads_in_pool(Int8(0))
+#threadpoolsize() = max(Int(JLOptions().nthreads), 1)
 
 function threading_run(fun, static)
     ccall(:jl_enter_threaded_region, Cvoid, ())
-    n = nthreads()
+    n = threadpoolsize()
     tasks = Vector{Task}(undef, n)
     for i = 1:n
         t = Task(() -> fun(i)) # pass in tid
@@ -93,7 +110,7 @@ function _threadsfor(iter, lbody, schedule)
             tid = 1
             len, rem = lenr, 0
         else
-            len, rem = divrem(lenr, nthreads())
+            len, rem = divrem(lenr, threadpoolsize())
         end
         # not enough iterations for all the threads?
         if len == 0
@@ -185,7 +202,7 @@ assumption may be removed in the future.
 This scheduling option is merely a hint to the underlying execution mechanism. However, a
 few properties can be expected. The number of `Task`s used by `:dynamic` scheduler is
 bounded by a small constant multiple of the number of available worker threads
-([`nthreads()`](@ref Threads.nthreads)). Each task processes contiguous regions of the
+([`Threads.threadpoolsize()`](@ref)). Each task processes contiguous regions of the
 iteration space. Thus, `@threads :dynamic for x in xs; f(x); end` is typically more
 efficient than `@sync for x in xs; @spawn f(x); end` if `length(xs)` is significantly
 larger than the number of the worker threads and the run-time of `f(x)` is relatively
@@ -222,7 +239,7 @@ julia> function busywait(seconds)
 
 julia> @time begin
            Threads.@spawn busywait(5)
-           Threads.@threads :static for i in 1:Threads.nthreads()
+           Threads.@threads :static for i in 1:Threads.threadpoolsize()
               busywait(1)
           end
       end
@@ -230,7 +247,7 @@ julia> @time begin
 
 julia> @time begin
            Threads.@spawn busywait(5)
-           Threads.@threads :dynamic for i in 1:Threads.nthreads()
+           Threads.@threads :dynamic for i in 1:Threads.threadpoolsize()
              busywait(1)
          end
       end
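With the pool split above, the single `Threads.nthreads()` query is replaced by three more precise ones: `nthreads(pool)` for one pool, `threadpoolsize()` for the default worker pool, and `maxthreadid()` for the only value that bounds `threadid()`. A rough sketch of what each would return, assuming a session started with four default-pool threads and one interactive thread (e.g. via the pool-aware `-t 4,1` startup flag):

```julia
Threads.nthreads(:default)      # 4: threads in the default pool
Threads.nthreads(:interactive)  # 1: threads in the interactive pool
Threads.threadpoolsize()        # 4: what worker-sizing code should use
Threads.maxthreadid()           # >= 5: may keep growing as threads are adopted
Threads.nthreads()              # 4: deprecated alias for threadpoolsize()
                                # (see the base/deprecated.jl hunk above)
```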
-""" -function resize_nthreads!(A::AbstractVector, copyvalue=A[1]) - nthr = nthreads() - nold = length(A) - resize!(A, nthr) - for i = nold+1:nthr - A[i] = deepcopy(copyvalue) - end - return A -end - end diff --git a/base/threads_overloads.jl b/base/threads_overloads.jl index 376c1af94f441..7241d3182901d 100644 --- a/base/threads_overloads.jl +++ b/base/threads_overloads.jl @@ -3,7 +3,7 @@ """ Threads.foreach(f, channel::Channel; schedule::Threads.AbstractSchedule=Threads.FairSchedule(), - ntasks=Threads.nthreads()) + ntasks=Base.threadpoolsize()) Similar to `foreach(f, channel)`, but iteration over `channel` and calls to `f` are split across `ntasks` tasks spawned by `Threads.@spawn`. This function @@ -40,7 +40,7 @@ collect(d) = [1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256 """ function Threads.foreach(f, channel::Channel; schedule::Threads.AbstractSchedule=Threads.FairSchedule(), - ntasks=Threads.nthreads()) + ntasks=Threads.threadpoolsize()) apply = _apply_for_schedule(schedule) stop = Threads.Atomic{Bool}(false) @sync for _ in 1:ntasks diff --git a/cli/loader_exe.c b/cli/loader_exe.c index 07a0bddcd4b87..ee7d27a563e04 100644 --- a/cli/loader_exe.c +++ b/cli/loader_exe.c @@ -15,7 +15,7 @@ extern "C" { JULIA_DEFINE_FAST_TLS #ifdef _COMPILER_ASAN_ENABLED_ -JL_DLLEXPORT const char* __asan_default_options() +JL_DLLEXPORT const char* __asan_default_options(void) { return "allow_user_segv_handler=1:detect_leaks=0"; // FIXME: enable LSAN after fixing leaks & defining __lsan_default_suppressions(), diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl index acd61be502465..264036f4fb3ae 100644 --- a/contrib/generate_precompile.jl +++ b/contrib/generate_precompile.jl @@ -1,7 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -if Threads.nthreads() != 1 - @warn "Running this file with multiple Julia threads may lead to a build error" Threads.nthreads() +if Threads.maxthreadid() != 1 + @warn "Running this file with multiple Julia threads may lead to a build error" Base.maxthreadid() end if Base.isempty(Base.ARGS) || Base.ARGS[1] !== "0" @@ -340,7 +340,7 @@ function generate_precompile_statements() # wait for the next prompt-like to appear readuntil(output_copy, "\n") strbuf = "" - while true + while !eof(output_copy) strbuf *= String(readavailable(output_copy)) occursin(JULIA_PROMPT, strbuf) && break occursin(PKG_PROMPT, strbuf) && break diff --git a/doc/src/base/multi-threading.md b/doc/src/base/multi-threading.md index 293857c1c6c65..4932aef4cc938 100644 --- a/doc/src/base/multi-threading.md +++ b/doc/src/base/multi-threading.md @@ -5,9 +5,11 @@ Base.Threads.@threads Base.Threads.foreach Base.Threads.@spawn Base.Threads.threadid +Base.Threads.maxthreadid Base.Threads.nthreads Base.Threads.threadpool Base.Threads.nthreadpools +Base.Threads.threadpoolsize ``` See also [Multi-Threading](@ref man-multithreading). 
diff --git a/doc/src/manual/embedding.md b/doc/src/manual/embedding.md
index 26904d9ccffcd..0430d8a7c1ffb 100644
--- a/doc/src/manual/embedding.md
+++ b/doc/src/manual/embedding.md
@@ -604,7 +604,7 @@ The second condition above implies that you can not safely call `jl_...()` funct
 void *func(void*)
 {
     // Wrong, jl_eval_string() called from thread that was not started by Julia
-    jl_eval_string("println(Threads.nthreads())");
+    jl_eval_string("println(Threads.threadid())");
     return NULL;
 }
 
@@ -630,7 +630,7 @@ void *func(void*)
     // Okay, all jl_...() calls from the same thread,
     // even though it is not the main application thread
     jl_init();
-    jl_eval_string("println(Threads.nthreads())");
+    jl_eval_string("println(Threads.threadid())");
     jl_atexit_hook(0);
     return NULL;
 }
@@ -670,7 +670,7 @@ int main()
     jl_eval_string("func(i) = ccall(:c_func, Float64, (Int32,), i)");
 
     // Call func() multiple times, using multiple threads to do so
-    jl_eval_string("println(Threads.nthreads())");
+    jl_eval_string("println(Threads.threadpoolsize())");
     jl_eval_string("use(i) = println(\"[J $(Threads.threadid())] i = $(i) -> $(func(i))\")");
     jl_eval_string("Threads.@threads for i in 1:5 use(i) end");
 
diff --git a/doc/src/manual/multi-threading.md b/doc/src/manual/multi-threading.md
index b20d0e54f1087..a536d4a508c09 100644
--- a/doc/src/manual/multi-threading.md
+++ b/doc/src/manual/multi-threading.md
@@ -6,10 +6,10 @@ of Julia multi-threading features.
 ## Starting Julia with multiple threads
 
 By default, Julia starts up with a single thread of execution. This can be verified by using the
-command [`Threads.nthreads()`](@ref):
+command [`Threads.threadpoolsize()`](@ref):
 
 ```jldoctest
-julia> Threads.nthreads()
+julia> Threads.threadpoolsize()
 1
 ```
 
@@ -38,7 +38,7 @@ $ julia --threads 4
 
 Let's verify there are 4 threads at our disposal.
```julia-repl -julia> Threads.nthreads() +julia> Threads.threadpoolsize() 4 ``` @@ -267,7 +267,7 @@ avoid the race: ```julia-repl julia> using Base.Threads -julia> nthreads() +julia> Threads.threadpoolsize() 4 julia> acc = Ref(0) diff --git a/src/ccall.cpp b/src/ccall.cpp index 88c80b333b027..274a50a8fac76 100644 --- a/src/ccall.cpp +++ b/src/ccall.cpp @@ -1540,7 +1540,8 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) assert(lrt == getVoidTy(ctx.builder.getContext())); assert(!isVa && !llvmcall && nccallargs == 0); JL_GC_POP(); - emit_gc_safepoint(ctx); + ctx.builder.CreateCall(prepare_call(gcroot_flush_func)); + emit_gc_safepoint(ctx.builder, get_current_ptls(ctx), ctx.tbaa().tbaa_const); return ghostValue(ctx, jl_nothing_type); } else if (is_libjulia_func("jl_get_ptls_states")) { @@ -1643,7 +1644,8 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) ctx.builder.SetInsertPoint(checkBB); ctx.builder.CreateLoad( getSizeTy(ctx.builder.getContext()), - ctx.builder.CreateConstInBoundsGEP1_32(getSizeTy(ctx.builder.getContext()), get_current_signal_page(ctx), -1), + ctx.builder.CreateConstInBoundsGEP1_32(getSizeTy(ctx.builder.getContext()), + get_current_signal_page_from_ptls(ctx.builder, get_current_ptls(ctx), ctx.tbaa().tbaa_const), -1), true); ctx.builder.CreateBr(contBB); ctx.f->getBasicBlockList().push_back(contBB); diff --git a/src/cgutils.cpp b/src/cgutils.cpp index 2f1803ef91051..7a1da252e1ca0 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -8,7 +8,6 @@ STATISTIC(EmittedPointerFromObjref, "Number of emitted pointer_from_objref calls"); STATISTIC(EmittedPointerBitcast, "Number of emitted pointer bitcasts"); -STATISTIC(EmittedNthPtrAddr, "Number of emitted nth pointer address instructions"); STATISTIC(EmittedTypeof, "Number of emitted typeof instructions"); STATISTIC(EmittedErrors, "Number of emitted errors"); STATISTIC(EmittedConditionalErrors, "Number of emitted conditional errors"); @@ -42,7 +41,6 @@ STATISTIC(EmittedCPointerChecks, "Number of C pointer checks emitted"); STATISTIC(EmittedAllocObjs, "Number of object allocations emitted"); STATISTIC(EmittedWriteBarriers, "Number of write barriers emitted"); STATISTIC(EmittedNewStructs, "Number of new structs emitted"); -STATISTIC(EmittedSignalFences, "Number of signal fences emitted"); STATISTIC(EmittedDeferSignal, "Number of deferred signals emitted"); static Value *track_pjlvalue(jl_codectx_t &ctx, Value *V) @@ -971,41 +969,20 @@ static void emit_memcpy(jl_codectx_t &ctx, Value *dst, MDNode *tbaa_dst, const j emit_memcpy_llvm(ctx, dst, tbaa_dst, data_pointer(ctx, src), src.tbaa, sz, align, is_volatile); } -static Value *emit_nthptr_addr(jl_codectx_t &ctx, Value *v, ssize_t n, bool gctracked = true) -{ - ++EmittedNthPtrAddr; - return ctx.builder.CreateInBoundsGEP( - ctx.types().T_prjlvalue, - emit_bitcast(ctx, maybe_decay_tracked(ctx, v), ctx.types().T_pprjlvalue), - ConstantInt::get(getSizeTy(ctx.builder.getContext()), n)); -} - -static Value *emit_nthptr_addr(jl_codectx_t &ctx, Value *v, Value *idx) +static LoadInst *emit_nthptr_recast(jl_codectx_t &ctx, Value *v, Value *idx, MDNode *tbaa, Type *type) { - ++EmittedNthPtrAddr; - return ctx.builder.CreateInBoundsGEP( + // p = (jl_value_t**)v; *(type*)&p[n] + Value *vptr = ctx.builder.CreateInBoundsGEP( ctx.types().T_prjlvalue, emit_bitcast(ctx, maybe_decay_tracked(ctx, v), ctx.types().T_pprjlvalue), idx); + LoadInst *load = ctx.builder.CreateLoad(type, emit_bitcast(ctx, vptr, PointerType::get(type, 0))); + 
tbaa_decorate(tbaa, load); + return load; } -static LoadInst *emit_nthptr_recast(jl_codectx_t &ctx, Value *v, Value *idx, MDNode *tbaa, Type *type) -{ - // p = (jl_value_t**)v; *(type*)&p[n] - Value *vptr = emit_nthptr_addr(ctx, v, idx); - return cast(tbaa_decorate(tbaa, ctx.builder.CreateLoad(type, - emit_bitcast(ctx, vptr, PointerType::get(type, 0))))); -} - -static LoadInst *emit_nthptr_recast(jl_codectx_t &ctx, Value *v, ssize_t n, MDNode *tbaa, Type *type) -{ - // p = (jl_value_t**)v; *(type*)&p[n] - Value *vptr = emit_nthptr_addr(ctx, v, n); - return cast(tbaa_decorate(tbaa, ctx.builder.CreateLoad(type, - emit_bitcast(ctx, vptr, PointerType::get(type, 0))))); - } - static Value *boxed(jl_codectx_t &ctx, const jl_cgval_t &v, bool is_promotable=false); + static Value *emit_typeof(jl_codectx_t &ctx, Value *v, bool maybenull); static jl_cgval_t emit_typeof(jl_codectx_t &ctx, const jl_cgval_t &p, bool maybenull) @@ -1177,8 +1154,12 @@ static Value *emit_datatype_isprimitivetype(jl_codectx_t &ctx, Value *dt) static Value *emit_datatype_name(jl_codectx_t &ctx, Value *dt) { - Value *vptr = emit_nthptr_addr(ctx, dt, (ssize_t)(offsetof(jl_datatype_t, name) / sizeof(char*))); - return tbaa_decorate(ctx.tbaa().tbaa_const, ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, vptr, Align(sizeof(void*)))); + unsigned n = offsetof(jl_datatype_t, name) / sizeof(char*); + Value *vptr = ctx.builder.CreateInBoundsGEP( + ctx.types().T_pjlvalue, + emit_bitcast(ctx, maybe_decay_tracked(ctx, dt), ctx.types().T_ppjlvalue), + ConstantInt::get(getSizeTy(ctx.builder.getContext()), n)); + return tbaa_decorate(ctx.tbaa().tbaa_const, ctx.builder.CreateAlignedLoad(ctx.types().T_pjlvalue, vptr, Align(sizeof(void*)))); } // --- generating various error checks --- @@ -1490,8 +1471,8 @@ static std::pair emit_isa(jl_codectx_t &ctx, const jl_cgval_t &x, // so the isa test reduces to a comparison of the typename by pointer return std::make_pair( ctx.builder.CreateICmpEQ( - mark_callee_rooted(ctx, emit_datatype_name(ctx, emit_typeof_boxed(ctx, x))), - mark_callee_rooted(ctx, literal_pointer_val(ctx, (jl_value_t*)dt->name))), + emit_datatype_name(ctx, emit_typeof_boxed(ctx, x)), + literal_pointer_val(ctx, (jl_value_t*)dt->name)), false); } if (jl_is_uniontype(intersected_type) && @@ -3427,10 +3408,10 @@ static void emit_cpointercheck(jl_codectx_t &ctx, const jl_cgval_t &x, const std emit_typecheck(ctx, mark_julia_type(ctx, t, true, jl_any_type), (jl_value_t*)jl_datatype_type, msg); Value *istype = - ctx.builder.CreateICmpEQ(mark_callee_rooted(ctx, emit_datatype_name(ctx, t)), - mark_callee_rooted(ctx, literal_pointer_val(ctx, (jl_value_t*)jl_pointer_typename))); - BasicBlock *failBB = BasicBlock::Create(ctx.builder.getContext(),"fail",ctx.f); - BasicBlock *passBB = BasicBlock::Create(ctx.builder.getContext(),"pass"); + ctx.builder.CreateICmpEQ(emit_datatype_name(ctx, t), + literal_pointer_val(ctx, (jl_value_t*)jl_pointer_typename)); + BasicBlock *failBB = BasicBlock::Create(ctx.builder.getContext(), "fail", ctx.f); + BasicBlock *passBB = BasicBlock::Create(ctx.builder.getContext(), "pass"); ctx.builder.CreateCondBr(istype, passBB, failBB); ctx.builder.SetInsertPoint(failBB); @@ -3878,8 +3859,7 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg static void emit_signal_fence(jl_codectx_t &ctx) { - ++EmittedSignalFences; - ctx.builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SyncScope::SingleThread); + emit_signal_fence(ctx.builder); } static Value *emit_defer_signal(jl_codectx_t 
&ctx) @@ -3892,69 +3872,6 @@ static Value *emit_defer_signal(jl_codectx_t &ctx) return ctx.builder.CreateInBoundsGEP(ctx.types().T_sigatomic, ptls, ArrayRef(offset), "jl_defer_signal"); } -static void emit_gc_safepoint(jl_codectx_t &ctx) -{ - ctx.builder.CreateCall(prepare_call(gcroot_flush_func)); - emit_signal_fence(ctx); - ctx.builder.CreateLoad(getSizeTy(ctx.builder.getContext()), get_current_signal_page(ctx), true); - emit_signal_fence(ctx); -} - -static Value *emit_gc_state_set(jl_codectx_t &ctx, Value *state, Value *old_state) -{ - Type *T_int8 = state->getType(); - Value *ptls = emit_bitcast(ctx, get_current_ptls(ctx), getInt8PtrTy(ctx.builder.getContext())); - Constant *offset = ConstantInt::getSigned(getInt32Ty(ctx.builder.getContext()), offsetof(jl_tls_states_t, gc_state)); - Value *gc_state = ctx.builder.CreateInBoundsGEP(T_int8, ptls, ArrayRef(offset), "gc_state"); - if (old_state == nullptr) { - old_state = ctx.builder.CreateLoad(T_int8, gc_state); - cast(old_state)->setOrdering(AtomicOrdering::Monotonic); - } - ctx.builder.CreateAlignedStore(state, gc_state, Align(sizeof(void*)))->setOrdering(AtomicOrdering::Release); - if (auto *C = dyn_cast(old_state)) - if (C->isZero()) - return old_state; - if (auto *C = dyn_cast(state)) - if (!C->isZero()) - return old_state; - BasicBlock *passBB = BasicBlock::Create(ctx.builder.getContext(), "safepoint", ctx.f); - BasicBlock *exitBB = BasicBlock::Create(ctx.builder.getContext(), "after_safepoint", ctx.f); - Constant *zero8 = ConstantInt::get(T_int8, 0); - ctx.builder.CreateCondBr(ctx.builder.CreateAnd(ctx.builder.CreateICmpNE(old_state, zero8), // if (old_state && !state) - ctx.builder.CreateICmpEQ(state, zero8)), - passBB, exitBB); - ctx.builder.SetInsertPoint(passBB); - emit_gc_safepoint(ctx); - ctx.builder.CreateBr(exitBB); - ctx.builder.SetInsertPoint(exitBB); - return old_state; -} - -static Value *emit_gc_unsafe_enter(jl_codectx_t &ctx) -{ - Value *state = ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0); - return emit_gc_state_set(ctx, state, nullptr); -} - -static Value *emit_gc_unsafe_leave(jl_codectx_t &ctx, Value *state) -{ - Value *old_state = ConstantInt::get(state->getType(), 0); - return emit_gc_state_set(ctx, state, old_state); -} - -//static Value *emit_gc_safe_enter(jl_codectx_t &ctx) -//{ -// Value *state = ConstantInt::get(getInt8Ty(ctx.builder.getContext()), JL_GC_STATE_SAFE); -// return emit_gc_state_set(ctx, state, nullptr); -//} -// -//static Value *emit_gc_safe_leave(jl_codectx_t &ctx, Value *state) -//{ -// Value *old_state = ConstantInt::get(state->getType(), JL_GC_STATE_SAFE); -// return emit_gc_state_set(ctx, state, old_state); -//} - - #ifndef JL_NDEBUG static int compare_cgparams(const jl_cgparams_t *a, const jl_cgparams_t *b) diff --git a/src/codegen.cpp b/src/codegen.cpp index 2f3974c3a5110..6f654e59d81b4 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -590,6 +590,11 @@ static const auto jlpgcstack_func = new JuliaFunction{ nullptr, }; +static const auto jladoptthread_func = new JuliaFunction{ + "julia.get_pgcstack_or_new", + jlpgcstack_func->_type, + jlpgcstack_func->_attrs, +}; // important functions @@ -1482,11 +1487,9 @@ static Value *global_binding_pointer(jl_codectx_t &ctx, jl_module_t *m, jl_sym_t static jl_cgval_t emit_checked_var(jl_codectx_t &ctx, Value *bp, jl_sym_t *name, bool isvol, MDNode *tbaa); static jl_cgval_t emit_sparam(jl_codectx_t &ctx, size_t i); static Value *emit_condition(jl_codectx_t &ctx, const jl_cgval_t &condV, const std::string &msg); -static void 
allocate_gc_frame(jl_codectx_t &ctx, BasicBlock *b0); static Value *get_current_task(jl_codectx_t &ctx); static Value *get_current_ptls(jl_codectx_t &ctx); static Value *get_last_age_field(jl_codectx_t &ctx); -static Value *get_current_signal_page(jl_codectx_t &ctx); static void CreateTrap(IRBuilder<> &irbuilder, bool create_new_block = true); static CallInst *emit_jlcall(jl_codectx_t &ctx, Function *theFptr, Value *theF, const jl_cgval_t *args, size_t nargs, JuliaFunction *trampoline); @@ -5327,21 +5330,17 @@ JL_GCC_IGNORE_STOP // --- generate function bodies --- // gc frame emission -static void allocate_gc_frame(jl_codectx_t &ctx, BasicBlock *b0) +static void allocate_gc_frame(jl_codectx_t &ctx, BasicBlock *b0, bool or_new=false) { // allocate a placeholder gc instruction // this will require the runtime, but it gets deleted later if unused - ctx.topalloca = ctx.builder.CreateCall(prepare_call(jlpgcstack_func)); + ctx.topalloca = ctx.builder.CreateCall(prepare_call(or_new ? jladoptthread_func : jlpgcstack_func)); ctx.pgcstack = ctx.topalloca; } static Value *get_current_task(jl_codectx_t &ctx) { - const int ptls_offset = offsetof(jl_task_t, gcstack); - return ctx.builder.CreateInBoundsGEP( - ctx.types().T_pjlvalue, emit_bitcast(ctx, ctx.pgcstack, ctx.types().T_ppjlvalue), - ConstantInt::get(getSizeTy(ctx.builder.getContext()), -(ptls_offset / sizeof(void *))), - "current_task"); + return get_current_task_from_pgcstack(ctx.builder, ctx.pgcstack); } // Get PTLS through current task. @@ -5361,15 +5360,6 @@ static Value *get_last_age_field(jl_codectx_t &ctx) "world_age"); } -// Get signal page through current task. -static Value *get_current_signal_page(jl_codectx_t &ctx) -{ - // return ctx.builder.CreateCall(prepare_call(reuse_signal_page_func)); - Value *ptls = get_current_ptls(ctx); - int nthfield = offsetof(jl_tls_states_t, safepoint) / sizeof(void *); - return emit_nthptr_recast(ctx, ptls, nthfield, ctx.tbaa().tbaa_const, getSizePtrTy(ctx.builder.getContext())); -} - static Function *emit_tojlinvoke(jl_code_instance_t *codeinst, Module *M, jl_codegen_params_t ¶ms) { ++EmittedToJLInvokes; @@ -5654,19 +5644,11 @@ static Function* gen_cfun_wrapper( ctx.builder.SetInsertPoint(b0); DebugLoc noDbg; ctx.builder.SetCurrentDebugLocation(noDbg); - allocate_gc_frame(ctx, b0); + allocate_gc_frame(ctx, b0, true); - Value *dummy_world = ctx.builder.CreateAlloca(getSizeTy(ctx.builder.getContext())); - Value *have_tls = ctx.builder.CreateIsNotNull(ctx.pgcstack); - // TODO: in the future, initialize a full TLS context here Value *world_age_field = get_last_age_field(ctx); - world_age_field = ctx.builder.CreateSelect(have_tls, world_age_field, dummy_world); Value *last_age = tbaa_decorate(ctx.tbaa().tbaa_gcframe, ctx.builder.CreateAlignedLoad(getSizeTy(ctx.builder.getContext()), world_age_field, Align(sizeof(size_t)))); - Value *last_gc_state = ConstantInt::get(getInt8Ty(ctx.builder.getContext()), JL_GC_STATE_SAFE); - last_gc_state = emit_guarded_test(ctx, have_tls, last_gc_state, [&] { - return emit_gc_unsafe_enter(ctx); - }); Value *world_v = ctx.builder.CreateAlignedLoad(getSizeTy(ctx.builder.getContext()), prepare_global_in(jl_Module, jlgetworld_global), Align(sizeof(size_t))); @@ -5681,12 +5663,7 @@ static Function* gen_cfun_wrapper( emit_bitcast(ctx, literal_pointer_val(ctx, (jl_value_t*)codeinst), getSizePtrTy(ctx.builder.getContext())), offsetof(jl_code_instance_t, max_world) / sizeof(size_t)), Align(sizeof(size_t))); - // XXX: age is always OK if we don't have a TLS. 
This is a hack required due to `@threadcall` abuse. - // and adds quite a bit of complexity here, even though it's still wrong - // (anything that tries to interact with the runtime will fault) age_ok = ctx.builder.CreateICmpUGE(lam_max, world_v); - world_v = ctx.builder.CreateSelect(ctx.builder.CreateOr(have_tls, age_ok), world_v, lam_max); - age_ok = ctx.builder.CreateOr(ctx.builder.CreateNot(have_tls), age_ok); } ctx.builder.CreateStore(world_v, world_age_field); @@ -6043,12 +6020,6 @@ static Function* gen_cfun_wrapper( } ctx.builder.CreateStore(last_age, world_age_field); - if (!sig.retboxed) { - emit_guarded_test(ctx, have_tls, nullptr, [&] { - emit_gc_unsafe_leave(ctx, last_gc_state); - return nullptr; - }); - } ctx.builder.CreateRet(r); ctx.builder.SetCurrentDebugLocation(noDbg); @@ -8490,6 +8461,7 @@ static void init_jit_functions(void) add_named_global(jl_write_barrier_func, (void*)NULL); add_named_global(jl_write_barrier_binding_func, (void*)NULL); add_named_global(jldlsym_func, &jl_load_and_lookup); + add_named_global("jl_adopt_thread", &jl_adopt_thread); add_named_global(jlgetcfunctiontrampoline_func, &jl_get_cfunction_trampoline); add_named_global(jlgetnthfieldchecked_func, &jl_get_nth_field_checked); add_named_global(diff_gc_total_bytes_func, &jl_gc_diff_total_bytes); diff --git a/src/codegen_shared.h b/src/codegen_shared.h index 0e68668378f4e..329cc567e8c5f 100644 --- a/src/codegen_shared.h +++ b/src/codegen_shared.h @@ -22,6 +22,7 @@ enum AddressSpace { }; static inline auto getSizeTy(llvm::LLVMContext &ctxt) { + //return M.getDataLayout().getIntPtrType(M.getContext()); if (sizeof(size_t) > sizeof(uint32_t)) { return llvm::Type::getInt64Ty(ctxt); } else { @@ -176,26 +177,127 @@ static inline llvm::Value *emit_bitcast_with_builder(llvm::IRBuilder<> &builder, } } +// Get PTLS through current task. +static inline llvm::Value *get_current_task_from_pgcstack(llvm::IRBuilder<> &builder, llvm::Value *pgcstack) +{ + using namespace llvm; + auto T_ppjlvalue = JuliaType::get_ppjlvalue_ty(builder.getContext()); + auto T_pjlvalue = JuliaType::get_pjlvalue_ty(builder.getContext()); + const int pgcstack_offset = offsetof(jl_task_t, gcstack); + return builder.CreateInBoundsGEP( + T_pjlvalue, emit_bitcast_with_builder(builder, pgcstack, T_ppjlvalue), + ConstantInt::get(getSizeTy(builder.getContext()), -(pgcstack_offset / sizeof(void *))), + "current_task"); +} + // Get PTLS through current task. static inline llvm::Value *get_current_ptls_from_task(llvm::IRBuilder<> &builder, llvm::Value *current_task, llvm::MDNode *tbaa) { using namespace llvm; auto T_ppjlvalue = JuliaType::get_ppjlvalue_ty(builder.getContext()); auto T_pjlvalue = JuliaType::get_pjlvalue_ty(builder.getContext()); - auto T_size = builder.GetInsertBlock()->getModule()->getDataLayout().getIntPtrType(builder.getContext()); + auto T_size = getSizeTy(builder.getContext()); const int ptls_offset = offsetof(jl_task_t, ptls); llvm::Value *pptls = builder.CreateInBoundsGEP( - T_pjlvalue, current_task, - ConstantInt::get(T_size, ptls_offset / sizeof(void *)), - "ptls_field"); + T_pjlvalue, current_task, + ConstantInt::get(T_size, ptls_offset / sizeof(void *)), + "ptls_field"); LoadInst *ptls_load = builder.CreateAlignedLoad(T_pjlvalue, - emit_bitcast_with_builder(builder, pptls, T_ppjlvalue), Align(sizeof(void *)), "ptls_load"); + emit_bitcast_with_builder(builder, pptls, T_ppjlvalue), Align(sizeof(void *)), "ptls_load"); // Note: Corresponding store (`t->ptls = ptls`) happens in `ctx_switch` of tasks.c. 
tbaa_decorate(tbaa, ptls_load); - // Using `CastInst::Create` to get an `Instruction*` without explicit cast: - auto ptls = CastInst::Create(Instruction::BitCast, ptls_load, T_ppjlvalue, "ptls"); - builder.Insert(ptls); - return ptls; + return builder.CreateBitCast(ptls_load, T_ppjlvalue, "ptls"); +} + +// Get signal page through current task. +static inline llvm::Value *get_current_signal_page_from_ptls(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa) +{ + using namespace llvm; + // return builder.CreateCall(prepare_call(reuse_signal_page_func)); + auto T_size = getSizeTy(builder.getContext()); + auto T_psize = T_size->getPointerTo(); + auto T_ppsize = T_psize->getPointerTo(); + int nthfield = offsetof(jl_tls_states_t, safepoint) / sizeof(void *); + ptls = emit_bitcast_with_builder(builder, ptls, T_ppsize); + llvm::Value *psafepoint = builder.CreateInBoundsGEP( + T_psize, ptls, ConstantInt::get(T_size, nthfield)); + LoadInst *ptls_load = builder.CreateAlignedLoad( + T_psize, psafepoint, Align(sizeof(void *)), "safepoint"); + tbaa_decorate(tbaa, ptls_load); + return ptls_load; +} + +static inline void emit_signal_fence(llvm::IRBuilder<> &builder) +{ + using namespace llvm; + builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SyncScope::SingleThread); +} + +static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa) +{ + emit_signal_fence(builder); + builder.CreateLoad(getSizeTy(builder.getContext()), get_current_signal_page_from_ptls(builder, ptls, tbaa), true); + emit_signal_fence(builder); +} + +static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, llvm::Value *old_state) +{ + using namespace llvm; + Type *T_int8 = state->getType(); + ptls = emit_bitcast_with_builder(builder, ptls, builder.getInt8PtrTy()); + Constant *offset = ConstantInt::getSigned(builder.getInt32Ty(), offsetof(jl_tls_states_t, gc_state)); + Value *gc_state = builder.CreateInBoundsGEP(T_int8, ptls, ArrayRef(offset), "gc_state"); + if (old_state == nullptr) { + old_state = builder.CreateLoad(T_int8, gc_state); + cast(old_state)->setOrdering(AtomicOrdering::Monotonic); + } + builder.CreateAlignedStore(state, gc_state, Align(sizeof(void*)))->setOrdering(AtomicOrdering::Release); + if (auto *C = dyn_cast(old_state)) + if (C->isZero()) + return old_state; + if (auto *C = dyn_cast(state)) + if (!C->isZero()) + return old_state; + BasicBlock *passBB = BasicBlock::Create(builder.getContext(), "safepoint", builder.GetInsertBlock()->getParent()); + BasicBlock *exitBB = BasicBlock::Create(builder.getContext(), "after_safepoint", builder.GetInsertBlock()->getParent()); + Constant *zero8 = ConstantInt::get(T_int8, 0); + builder.CreateCondBr(builder.CreateAnd(builder.CreateICmpNE(old_state, zero8), // if (old_state && !state) + builder.CreateICmpEQ(state, zero8)), + passBB, exitBB); + builder.SetInsertPoint(passBB); + MDNode *tbaa = get_tbaa_const(builder.getContext()); + emit_gc_safepoint(builder, ptls, tbaa); + builder.CreateBr(exitBB); + builder.SetInsertPoint(exitBB); + return old_state; +} + +static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls) +{ + using namespace llvm; + Value *state = builder.getInt8(0); + return emit_gc_state_set(builder, ptls, state, nullptr); +} + +static inline llvm::Value *emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state) +{ + using namespace llvm; + Value *old_state = builder.getInt8(0); + return 
emit_gc_state_set(builder, ptls, state, old_state); +} + +static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls) +{ + using namespace llvm; + Value *state = builder.getInt8(JL_GC_STATE_SAFE); + return emit_gc_state_set(builder, ptls, state, nullptr); +} + +static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state) +{ + using namespace llvm; + Value *old_state = builder.getInt8(JL_GC_STATE_SAFE); + return emit_gc_state_set(builder, ptls, state, old_state); } // Compatibility shims for LLVM attribute APIs that were renamed in LLVM 14. @@ -327,5 +429,4 @@ inline Attribute getAttributeAtIndex(const AttributeList &L, unsigned Index, Att return L.getAttribute(Index, Kind); #endif } - } diff --git a/src/flisp/flmain.c b/src/flisp/flmain.c index f3861eed9e8a2..68e9d87a26d77 100644 --- a/src/flisp/flmain.c +++ b/src/flisp/flmain.c @@ -10,7 +10,7 @@ extern "C" { #if defined(__has_feature) #if __has_feature(address_sanitizer) -const char* __asan_default_options() { +const char* __asan_default_options(void) { return "detect_leaks=0"; } #endif diff --git a/src/gc-alloc-profiler.cpp b/src/gc-alloc-profiler.cpp index 818d6e803c9df..1bcbeb2189f5f 100644 --- a/src/gc-alloc-profiler.cpp +++ b/src/gc-alloc-profiler.cpp @@ -80,7 +80,8 @@ extern "C" { // Needed since these functions doesn't take any arguments. JL_DLLEXPORT void jl_start_alloc_profile(double sample_rate) { // We only need to do this once, the first time this is called. - while (g_alloc_profile.per_thread_profiles.size() < (size_t)jl_n_threads) { + size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + while (g_alloc_profile.per_thread_profiles.size() < nthreads) { g_alloc_profile.per_thread_profiles.push_back(jl_per_thread_alloc_profile_t{}); } @@ -131,7 +132,10 @@ JL_DLLEXPORT void jl_free_alloc_profile() { void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *type) JL_NOTSAFEPOINT { auto& global_profile = g_alloc_profile; - auto thread_id = jl_atomic_load_relaxed(&jl_current_task->tid); + size_t thread_id = jl_atomic_load_relaxed(&jl_current_task->tid); + if (thread_id >= global_profile.per_thread_profiles.size()) + return; // ignore allocations on threads started after the alloc-profile started + auto& profile = global_profile.per_thread_profiles[thread_id]; auto sample_val = double(rand()) / double(RAND_MAX); diff --git a/src/gc-debug.c b/src/gc-debug.c index 7d6ca8ece2ecf..dae634bd9b8fd 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -99,7 +99,7 @@ static arraylist_t bits_save[4]; static void gc_clear_mark_page(jl_gc_pagemeta_t *pg, int bits) { - jl_ptls_t ptls2 = jl_all_tls_states[pg->thread_n]; + jl_ptls_t ptls2 = gc_all_tls_states[pg->thread_n]; jl_gc_pool_t *pool = &ptls2->heap.norm_pools[pg->pool_n]; jl_taggedvalue_t *pv = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); char *lim = (char*)pv + GC_PAGE_SZ - GC_PAGE_OFFSET - pool->osize; @@ -164,8 +164,8 @@ static void clear_mark(int bits) } } bigval_t *v; - for (int i = 0;i < jl_n_threads;i++) { - v = jl_all_tls_states[i]->heap.big_objects; + for (int i = 0; i < gc_n_threads; i++) { + v = gc_all_tls_states[i]->heap.big_objects; while (v != NULL) { void *gcv = &v->header; if (!gc_verifying) @@ -207,8 +207,8 @@ static void gc_verify_track(jl_ptls_t ptls) clear_mark(GC_CLEAN); gc_mark_queue_all_roots(ptls, &sp); gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 
0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); @@ -256,8 +256,8 @@ void gc_verify(jl_ptls_t ptls) gc_verifying = 1; gc_mark_queue_all_roots(ptls, &sp); gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); @@ -297,7 +297,7 @@ static void gc_verify_tags_page(jl_gc_pagemeta_t *pg) // for all pages in use int p_n = pg->pool_n; int t_n = pg->thread_n; - jl_ptls_t ptls2 = jl_all_tls_states[t_n]; + jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; char *data = pg->data; @@ -401,8 +401,8 @@ static void gc_verify_tags_pagetable(void) void gc_verify_tags(void) { // verify the freelist chains look valid - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; for (int i = 0; i < JL_GC_N_POOLS; i++) { // for all pools, iterate its freelist jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; @@ -467,7 +467,7 @@ static void gc_debug_alloc_init(jl_alloc_num_t *num, const char *name) return; if (*env == 'r') { env++; - for (int i = 0;i < 3;i++) { + for (int i = 0; i < 3; i++) { while (num->random[i] == 0) { num->random[i] = jl_rand(); } @@ -577,7 +577,7 @@ static void gc_scrub_task(jl_task_t *ta) jl_ptls_t ptls = jl_current_task->ptls; jl_ptls_t ptls2 = NULL; if (tid != -1) - ptls2 = jl_all_tls_states[tid]; + ptls2 = gc_all_tls_states[tid]; char *low; char *high; @@ -946,8 +946,8 @@ void gc_time_mark_pause(int64_t t0, int64_t scanned_bytes, { int64_t last_remset_len = 0; int64_t remset_nptr = 0; - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; last_remset_len += ptls2->heap.last_remset->len; remset_nptr = ptls2->heap.remset_nptr; } @@ -1023,7 +1023,7 @@ void jl_gc_debug_init(void) #endif #ifdef OBJPROFILE - for (int g = 0;g < 3;g++) { + for (int g = 0; g < 3; g++) { htable_new(&obj_counts[g], 0); htable_new(&obj_sizes[g], 0); } @@ -1085,8 +1085,8 @@ void gc_stats_all_pool(void) { size_t nb=0, w, tw=0, no=0, tp=0, nold=0, noldbytes=0, np, nol; for (int i = 0; i < JL_GC_N_POOLS; i++) { - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; size_t b = pool_stats(&ptls2->heap.norm_pools[i], &w, &np, &nol); nb += b; no += (b / ptls2->heap.norm_pools[i].osize); @@ -1110,8 +1110,8 @@ void gc_stats_all_pool(void) void gc_stats_big_obj(void) { size_t nused=0, nbytes=0, nused_old=0, nbytes_old=0; - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; bigval_t *v = ptls2->heap.big_objects; while (v != NULL) { if (gc_marked(v->bits.gc)) { @@ -1219,7 +1219,7 @@ void gc_count_pool(void) empty_pages = 0; gc_count_pool_pagetable(); jl_safe_printf("****** Pool stat: ******\n"); - for (int i = 0;i < 4;i++) + 
for (int i = 0; i < 4; i++) jl_safe_printf("bits(%d): %" PRId64 "\n", i, poolobj_sizes[i]); // empty_pages is inaccurate after the sweep since young objects are // also GC_CLEAN diff --git a/src/gc-stacks.c b/src/gc-stacks.c index b7adf254026ca..32f5058c4e3ce 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -188,8 +188,9 @@ void sweep_stack_pools(void) // bufsz = t->bufsz // if (stkbuf) // push(free_stacks[sz], stkbuf) - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { diff --git a/src/gc.c b/src/gc.c index c45ff8206ca67..9984b07435111 100644 --- a/src/gc.c +++ b/src/gc.c @@ -168,6 +168,8 @@ static _Atomic(int) support_conservative_marking = 0; jl_gc_num_t gc_num = {0}; static size_t last_long_collect_interval; +int gc_n_threads; +jl_ptls_t* gc_all_tls_states; pagetable_t memory_map; @@ -190,12 +192,15 @@ NOINLINE uintptr_t gc_get_stack_ptr(void) #define should_timeout() 0 -static void jl_gc_wait_for_the_world(void) +void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) { - if (jl_n_threads > 1) + assert(gc_n_threads); + if (gc_n_threads > 1) jl_wake_libuv(); - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; // This acquire load pairs with the release stores // in the signal handler of safepoint so we are sure that // all the stores on those threads are visible. @@ -207,6 +212,9 @@ static void jl_gc_wait_for_the_world(void) } } + +void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads); + // malloc wrappers, aligned allocation #if defined(_OS_WINDOWS_) @@ -269,8 +277,8 @@ static void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT static void run_finalizer(jl_task_t *ct, jl_value_t *o, jl_value_t *ff) { - if (gc_ptr_tag(o, 1)) { - ((void (*)(void*))ff)(gc_ptr_clear_tag(o, 1)); + if (gc_ptr_tag(o, 3)) { + ((void (*)(void*))ff)(gc_ptr_clear_tag(o, 3)); return; } jl_value_t *args[2] = {ff,o}; @@ -411,7 +419,10 @@ static void run_finalizers(jl_task_t *ct) jl_rng_split(ct->rngState, finalizer_rngState); // This releases the finalizers lock. 
+ int8_t was_in_finalizer = ct->ptls->in_finalizer; + ct->ptls->in_finalizer = 1; jl_gc_run_finalizers_in_list(ct, &copied_list); + ct->ptls->in_finalizer = was_in_finalizer; arraylist_free(&copied_list); memcpy(&ct->rngState[0], &save_rngState[0], sizeof(save_rngState)); @@ -423,9 +434,7 @@ JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0) { - ptls->in_finalizer = 1; run_finalizers(ct); - ptls->in_finalizer = 0; } } @@ -496,13 +505,18 @@ static void schedule_all_finalizers(arraylist_t *flist) JL_NOTSAFEPOINT void jl_gc_run_all_finalizers(jl_task_t *ct) { + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); schedule_all_finalizers(&finalizer_list_marked); - // This could be run before we had a chance to setup all threads - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2) schedule_all_finalizers(&ptls2->finalizers); } + gc_n_threads = 0; + gc_all_tls_states = NULL; run_finalizers(ct); } @@ -539,6 +553,13 @@ JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 1), f); } +// schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) +JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT +{ + assert(!gc_ptr_tag(v, 3)); + jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 3), f); +} + JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT { if (__unlikely(jl_typeis(f, jl_voidpointer_type))) { @@ -559,11 +580,18 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) arraylist_new(&copied_list, 0); // No need to check the to_finalize list since the user is apparently // still holding a reference to the object - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; - finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2) + finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); } finalize_object(&finalizer_list_marked, o, &copied_list, 0); + gc_n_threads = 0; + gc_all_tls_states = NULL; if (copied_list.len > 0) { // This releases the finalizers lock. jl_gc_run_finalizers_in_list(ct, &copied_list); @@ -595,9 +623,11 @@ static void gc_sweep_foreign_objs_in_list(arraylist_t *objs) static void gc_sweep_foreign_objs(void) { - for (int i = 0;i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; - gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2) + gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); } } @@ -608,18 +638,19 @@ static int64_t last_gc_total_bytes = 0; // under this limit, but we will go above it rather than halting. 
#ifdef _P64 typedef uint64_t memsize_t; -#define default_collect_interval (5600*1024*sizeof(void*)) -static size_t max_collect_interval = 1250000000UL; -// Eventually we can expose this to the user/ci. -memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; +static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); +static const size_t max_collect_interval = 1250000000UL; +static size_t total_mem; +// We expose this to the user/ci as jl_gc_set_max_memory +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; #else typedef uint32_t memsize_t; -#define default_collect_interval (3200*1024*sizeof(void*)) -static size_t max_collect_interval = 500000000UL; +static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); +static const size_t max_collect_interval = 500000000UL; // Work really hard to stay within 2GB // Alternative is to risk running out of address space // on 32 bit architectures. -memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; #endif // global variables for GC stats @@ -725,9 +756,11 @@ static void gc_sync_cache(jl_ptls_t ptls) JL_NOTSAFEPOINT // No other threads can be running marking at the same time static void gc_sync_all_caches_nolock(jl_ptls_t ptls) { - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; - gc_sync_cache_nolock(ptls, &ptls2->gc_cache); + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2) + gc_sync_cache_nolock(ptls, &ptls2->gc_cache); } } @@ -934,8 +967,11 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, static void clear_weak_refs(void) { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; size_t n, l = ptls2->heap.weak_refs.len; void **lst = ptls2->heap.weak_refs.items; for (n = 0; n < l; n++) { @@ -948,8 +984,11 @@ static void clear_weak_refs(void) static void sweep_weak_refs(void) { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; size_t n = 0; size_t ndel = 0; size_t l = ptls2->heap.weak_refs.len; @@ -1066,11 +1105,16 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT { gc_time_big_start(); - for (int i = 0;i < jl_n_threads;i++) - sweep_big_list(sweep_full, &jl_all_tls_states[i]->heap.big_objects); + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; + sweep_big_list(sweep_full, &ptls2->heap.big_objects); + } if (sweep_full) { bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); - // Move all survivors from big_objects_marked list to big_objects list. + // Move all survivors from big_objects_marked list to the big_objects list of this thread. 
if (ptls->heap.big_objects) ptls->heap.big_objects->prev = last_next; *last_next = ptls->heap.big_objects; @@ -1109,8 +1153,12 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls = jl_all_tls_states[i]; + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval); dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed); @@ -1125,8 +1173,12 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls = jl_all_tls_states[i]; + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); @@ -1173,8 +1225,11 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT { gc_time_mallocd_array_start(); - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; mallocarray_t *ma = ptls2->heap.mallocarrays; mallocarray_t **pma = &ptls2->heap.mallocarrays; while (ma != NULL) { @@ -1198,11 +1253,10 @@ static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT } // pool allocation -static inline jl_taggedvalue_t *reset_page(const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT +static inline jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT { assert(GC_PAGE_OFFSET >= sizeof(void*)); pg->nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / p->osize; - jl_ptls_t ptls2 = jl_all_tls_states[pg->thread_n]; pg->pool_n = p - ptls2->heap.norm_pools; memset(pg->ages, 0, GC_PAGE_SZ / 8 / p->osize + 1); jl_taggedvalue_t *beg = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); @@ -1240,7 +1294,7 @@ static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT pg->osize = p->osize; pg->ages = (uint8_t*)malloc_s(GC_PAGE_SZ / 8 / p->osize + 1); pg->thread_n = ptls->tid; - jl_taggedvalue_t *fl = reset_page(p, pg, NULL); + jl_taggedvalue_t *fl = reset_page(ptls, p, pg, NULL); p->newpages = fl; return fl; } @@ -1352,7 +1406,8 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t // FIXME - need to do accounting on a per-thread basis // on quick sweeps, keep a few pages empty but allocated for performance if (!sweep_full && lazy_freed_pages <= default_collect_interval / GC_PAGE_SZ) { - jl_taggedvalue_t *begin = reset_page(p, pg, p->newpages); + jl_ptls_t ptls2 = gc_all_tls_states[pg->thread_n]; + jl_taggedvalue_t *begin = reset_page(ptls2, p, pg, p->newpages); p->newpages = begin; begin->next = (jl_taggedvalue_t*)0; lazy_freed_pages++; @@ -1454,7 +1509,7 @@ static inline void 
sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg { int p_n = pg->pool_n; int t_n = pg->thread_n; - jl_ptls_t ptls2 = jl_all_tls_states[t_n]; + jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; pfl[t_n * JL_GC_N_POOLS + p_n] = sweep_page(p, pg, pfl[t_n * JL_GC_N_POOLS + p_n], sweep_full, osize); @@ -1566,9 +1621,9 @@ static void gc_sweep_pool(int sweep_full) gc_time_pool_start(); lazy_freed_pages = 0; - // For the benfit of the analyzer, which doesn't know that jl_n_threads + // For the benefit of the analyzer, which doesn't know that gc_n_threads // doesn't change over the course of this function - size_t n_threads = jl_n_threads; + size_t n_threads = gc_n_threads; // allocate enough space to hold the end of the free list chain // for every thread and pool size @@ -1577,7 +1632,13 @@ static void gc_sweep_pool(int sweep_full) // update metadata of pages that were pointed to by freelist or newpages from a pool // i.e. pages being the current allocation target for (int t_i = 0; t_i < n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) { + for (int i = 0; i < JL_GC_N_POOLS; i++) { + pfl[t_i * JL_GC_N_POOLS + i] = NULL; + } + continue; + } for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; @@ -1606,6 +1667,9 @@ static void gc_sweep_pool(int sweep_full) // null out terminal pointers of free lists for (int t_i = 0; t_i < n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; for (int i = 0; i < JL_GC_N_POOLS; i++) { *pfl[t_i * JL_GC_N_POOLS + i] = NULL; } @@ -2360,10 +2424,13 @@ stack: { } else { new_obj = (jl_value_t*)gc_read_stack(&rts[i], offset, lb, ub); - if (gc_ptr_tag(new_obj, 1)) { + if (gc_ptr_tag(new_obj, 3)) { // handle tagged pointers in finalizer list new_obj = gc_ptr_clear_tag(new_obj, 1); + // skip over the finalizer fptr i++; + if (gc_ptr_tag(new_obj, 2)) + continue; } } if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) @@ -2537,6 +2604,8 @@ finlist: { new_obj = *begin; if (__unlikely(!new_obj)) continue; + if (gc_ptr_tag(new_obj, 2)) + continue; if (gc_ptr_tag(new_obj, 1)) { new_obj = (jl_value_t*)gc_ptr_clear_tag(new_obj, 1); begin++; @@ -2716,7 +2785,7 @@ mark: { int16_t tid = jl_atomic_load_relaxed(&ta->tid); gc_invoke_callbacks(jl_gc_cb_task_scanner_t, gc_cblist_task_scanner, - (ta, tid != -1 && ta == jl_all_tls_states[tid]->root_task)); + (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); import_gc_state(ptls, &sp); } #ifdef COPY_STACKS @@ -2733,7 +2802,7 @@ mark: { if (stkbuf && ta->copy_stack && ta->ptls == NULL) { int16_t tid = jl_atomic_load_relaxed(&ta->tid); assert(tid >= 0); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = gc_all_tls_states[tid]; ub = (uintptr_t)ptls2->stackbase; lb = ub - ta->copy_stack; offset = (uintptr_t)stkbuf - lb; @@ -2843,12 +2912,19 @@ mark: { static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) { - gc_mark_queue_obj(gc_cache, sp, jl_atomic_load_relaxed(&ptls2->current_task)); - gc_mark_queue_obj(gc_cache, sp, ptls2->root_task); - if (ptls2->next_task) - gc_mark_queue_obj(gc_cache, sp, ptls2->next_task); - if (ptls2->previous_task) // shouldn't be necessary, but no reason not to - gc_mark_queue_obj(gc_cache, sp, ptls2->previous_task); + jl_task_t *task; + task = ptls2->root_task; + if (task) + 
gc_mark_queue_obj(gc_cache, sp, task); + task = jl_atomic_load_relaxed(&ptls2->current_task); + if (task) + gc_mark_queue_obj(gc_cache, sp, task); + task = ptls2->next_task; + if (task) + gc_mark_queue_obj(gc_cache, sp, task); + task = ptls2->previous_task; + if (task) // shouldn't be necessary, but no reason not to + gc_mark_queue_obj(gc_cache, sp, task); if (ptls2->previous_exception) gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); } @@ -2897,17 +2973,25 @@ static void sweep_finalizer_list(arraylist_t *list) size_t j = 0; for (size_t i=0; i < len; i+=2) { void *v0 = items[i]; - void *v = gc_ptr_clear_tag(v0, 1); + void *v = gc_ptr_clear_tag(v0, 3); if (__unlikely(!v0)) { // remove from this list continue; } void *fin = items[i+1]; - int isfreed = !gc_marked(jl_astaggedvalue(v)->bits.gc); - int isold = (list != &finalizer_list_marked && + int isfreed; + int isold; + if (gc_ptr_tag(v, 2)) { + isfreed = 1; + isold = 0; + } + else { + isfreed = !gc_marked(jl_astaggedvalue(v)->bits.gc); + isold = (list != &finalizer_list_marked && jl_astaggedvalue(v)->bits.gc == GC_OLD_MARKED && jl_astaggedvalue(fin)->bits.gc == GC_OLD_MARKED); + } if (isfreed || isold) { // remove from this list } @@ -3090,11 +3174,18 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) uint64_t start_mark_time = jl_hrtime(); // 1. fix GC bits of objects in the remset. - for (int t_i = 0; t_i < jl_n_threads; t_i++) - jl_gc_premark(jl_all_tls_states[t_i]); + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 != NULL) + jl_gc_premark(ptls2); + } - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; // 2.1. mark every object in the `last_remsets` and `rem_binding` jl_gc_queue_remset(gc_cache, &sp, ptls2); // 2.2. mark every thread local root @@ -3130,16 +3221,22 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // mark the object moved to the marked list from the // `finalizer_list` by `sweep_finalizer_list` size_t orig_marked_len = finalizer_list_marked.len; - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; sweep_finalizer_list(&ptls2->finalizers); } if (prev_sweep_full) { sweep_finalizer_list(&finalizer_list_marked); orig_marked_len = 0; } - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, orig_marked_len); @@ -3178,8 +3275,13 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // 5. 
next collection decision int not_freed_enough = (collection == JL_GC_AUTO) && estimate_freed < (7*(actual_allocd/10)); int nptr = 0; - for (int i = 0;i < jl_n_threads;i++) - nptr += jl_all_tls_states[i]->heap.remset_nptr; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; + nptr += ptls2->heap.remset_nptr; + } // many pointers in the intergen frontier => "quick" mark is not quick int large_frontier = nptr*sizeof(void*) >= default_collect_interval; @@ -3194,9 +3296,16 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (large_frontier) { sweep_full = 1; } - if (gc_num.interval > max_collect_interval) { + size_t maxmem = 0; +#ifdef _P64 + // on a big memory machine, increase max_collect_interval to totalmem / nthreads / 2 + maxmem = total_mem / gc_n_threads / 2; +#endif + if (maxmem < max_collect_interval) + maxmem = max_collect_interval; + if (gc_num.interval > maxmem) { sweep_full = 1; - gc_num.interval = max_collect_interval; + gc_num.interval = maxmem; } } @@ -3244,8 +3353,11 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // sweeping is over // 6. if it is a quick sweep, put back the remembered objects in queued state // so that we don't trigger the barrier again on them. - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; if (!sweep_full) { for (int i = 0; i < ptls2->heap.remset->len; i++) { jl_astaggedvalue(ptls2->heap.remset->items[i])->bits.gc = GC_MARKED; @@ -3357,9 +3469,17 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) #endif // Now we are ready to wait for other threads to hit the safepoint, // we can do a few things that doesn't require synchronization. - // TODO (concurrently queue objects) - // no-op for non-threading - jl_gc_wait_for_the_world(); + // + // We must sync here with the tls_lock operations, so that we have a + // seq-cst order between these events now we know that either the new + // thread must run into our safepoint flag or we must observe the + // existence of the thread in the jl_n_threads count. + // + // TODO: concurrently queue objects + jl_fence(); + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads); JL_PROBE_GC_STOP_THE_WORLD(); uint64_t t1 = jl_hrtime(); @@ -3382,7 +3502,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) JL_UNLOCK_NOGC(&finalizers_lock); } - // no-op for non-threading + gc_n_threads = 0; + gc_all_tls_states = NULL; jl_safepoint_end_gc(); jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); JL_PROBE_GC_END(); @@ -3391,10 +3512,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) // Doing this on all threads is racy (it's impossible to check // or wait for finalizers on other threads without dead lock). 
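The pattern above recurs throughout the collector: at stop-the-world time `jl_gc_collect` executes `jl_fence()`, loads `jl_n_threads` with acquire ordering into `gc_n_threads` (and `jl_all_tls_states` into `gc_all_tls_states` alongside it), and resets both just before `jl_safepoint_end_gc()`. Every per-thread loop between those two points must therefore tolerate NULL slots, which are left behind by adopted threads that have since exited. A minimal sketch of the idiom, assuming the `gc_n_threads`/`gc_all_tls_states` globals declared in src/gc.h (the helper name `for_each_live_thread` is hypothetical):

    static void for_each_live_thread(void (*visit)(jl_ptls_t))
    {
        // Only valid while the world is stopped: gc_n_threads is nonzero
        // between the snapshot in jl_gc_collect and the reset at the end.
        assert(gc_n_threads);
        for (int t_i = 0; t_i < gc_n_threads; t_i++) {
            jl_ptls_t ptls2 = gc_all_tls_states[t_i];
            if (ptls2 == NULL)
                continue; // slot belongs to a thread that already exited
            visit(ptls2);
        }
    }

The finalizer gate that follows is unchanged, apart from the `in_finalizer` bookkeeping moving into `run_finalizers` itself: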
if (!ptls->finalizers_inhibited && ptls->locks.len == 0) { - int8_t was_in_finalizer = ptls->in_finalizer; - ptls->in_finalizer = 1; run_finalizers(ct); - ptls->in_finalizer = was_in_finalizer; } JL_PROBE_GC_FINALIZER(); @@ -3409,8 +3527,12 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - for (size_t i = 0; i < jl_n_threads; i++) - jl_gc_queue_thread_local(gc_cache, sp, jl_all_tls_states[i]); + assert(gc_n_threads); + for (size_t i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2) + jl_gc_queue_thread_local(gc_cache, sp, ptls2); + } mark_roots(gc_cache, sp); } @@ -3424,8 +3546,6 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { - if (ptls->tid == 0) - ptls->disable_gc = 1; jl_thread_heap_t *heap = &ptls->heap; jl_gc_pool_t *p = heap->norm_pools; for (int i = 0; i < JL_GC_N_POOLS; i++) { @@ -3480,14 +3600,10 @@ void jl_gc_init(void) gc_num.max_memory = 0; #ifdef _P64 - // on a big memory machine, set max_collect_interval to totalmem / nthreads / 2 - uint64_t total_mem = uv_get_total_memory(); + total_mem = uv_get_total_memory(); uint64_t constrained_mem = uv_get_constrained_memory(); if (constrained_mem > 0 && constrained_mem < total_mem) total_mem = constrained_mem; - size_t maxmem = total_mem / jl_n_threads / 2; - if (maxmem > max_collect_interval) - max_collect_interval = maxmem; #endif // We allocate with abandon until we get close to the free memory on the machine. @@ -3942,7 +4058,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) goto valid_object; } jl_gc_pool_t *pool = - jl_all_tls_states[meta->thread_n]->heap.norm_pools + + gc_all_tls_states[meta->thread_n]->heap.norm_pools + meta->pool_n; if (meta->fl_begin_offset == (uint16_t) -1) { // case 2: this is a page on the newpages list diff --git a/src/gc.h b/src/gc.h index 00c3d48b52935..29c904b796e45 100644 --- a/src/gc.h +++ b/src/gc.h @@ -393,6 +393,8 @@ extern bigval_t *big_objects_marked; extern arraylist_t finalizer_list_marked; extern arraylist_t to_finalize; extern int64_t lazy_freed_pages; +extern int gc_n_threads; +extern jl_ptls_t* gc_all_tls_states; STATIC_INLINE bigval_t *bigval_header(jl_taggedvalue_t *o) JL_NOTSAFEPOINT { diff --git a/src/gf.c b/src/gf.c index 1d36589a082f5..60a267873a321 100644 --- a/src/gf.c +++ b/src/gf.c @@ -2021,8 +2021,7 @@ static void record_precompile_statement(jl_method_instance_t *mi) if (!jl_is_method(def)) return; - if (jl_n_threads > 1) - JL_LOCK(&precomp_statement_out_lock); + JL_LOCK(&precomp_statement_out_lock); if (s_precompile == NULL) { const char *t = jl_options.trace_compile; if (!strncmp(t, "stderr", 6)) { @@ -2041,8 +2040,7 @@ static void record_precompile_statement(jl_method_instance_t *mi) if (s_precompile != JL_STDERR) ios_flush(&f_precompile); } - if (jl_n_threads > 1) - JL_UNLOCK(&precomp_statement_out_lock); + JL_UNLOCK(&precomp_statement_out_lock); } jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t world) diff --git a/src/init.c b/src/init.c index a5632fc66a45a..d5e3c60698af3 100644 --- a/src/init.c +++ b/src/init.c @@ -211,11 +211,9 @@ static void jl_close_item_atexit(uv_handle_t *handle) JL_DLLEXPORT void jl_atexit_hook(int exitcode) { - if (jl_all_tls_states == NULL) + if (jl_atomic_load_relaxed(&jl_all_tls_states) == NULL) return; - jl_task_t *ct = 
jl_current_task; - if (exitcode == 0) jl_write_compiler_output(); jl_print_gc_stats(JL_STDERR); @@ -223,7 +221,14 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) jl_write_coverage_data(jl_options.output_code_coverage); if (jl_options.malloc_log) jl_write_malloc_log(); + int8_t old_state; + jl_task_t *ct = NULL; if (jl_base_module) { + ct = jl_get_current_task(); + if (ct == NULL) { + ct = container_of(jl_adopt_thread(), jl_task_t, gcstack); + } + old_state = jl_gc_unsafe_enter(ct->ptls); jl_value_t *f = jl_get_global(jl_base_module, jl_symbol("_atexit")); if (f != NULL) { JL_TRY { @@ -246,7 +251,8 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) JL_STDOUT = (uv_stream_t*) STDOUT_FILENO; JL_STDERR = (uv_stream_t*) STDERR_FILENO; - jl_gc_run_all_finalizers(ct); + if (ct != NULL) + jl_gc_run_all_finalizers(ct); uv_loop_t *loop = jl_global_event_loop(); if (loop != NULL) { @@ -295,11 +301,13 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) #endif jl_teardown_codegen(); + if (ct != NULL) + jl_gc_unsafe_leave(ct->ptls, old_state); } JL_DLLEXPORT void jl_postoutput_hook(void) { - if (jl_all_tls_states == NULL) + if (jl_atomic_load_relaxed(&jl_all_tls_states) == NULL) return; if (jl_base_module) { @@ -777,7 +785,7 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ if (jl_base_module == NULL) { // nthreads > 1 requires code in Base - jl_n_threads = 1; + jl_atomic_store_relaxed(&jl_n_threads, 1); } jl_start_threads(); diff --git a/src/jl_exported_data.inc b/src/jl_exported_data.inc index eae13a4ff285e..e426f2c1f8b42 100644 --- a/src/jl_exported_data.inc +++ b/src/jl_exported_data.inc @@ -130,7 +130,7 @@ // Data symbols that are defined inside the public libjulia #define JL_EXPORTED_DATA_SYMBOLS(XX) \ XX(jl_n_threadpools, int) \ - XX(jl_n_threads, int) \ + XX(jl_n_threads, _Atomic(int)) \ XX(jl_options, jl_options_t) \ // end of file diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 1bd324325c57b..186dfc4a98ca7 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -3,6 +3,7 @@ #define JL_RUNTIME_EXPORTED_FUNCS(XX) \ XX(jl_active_task_stack) \ XX(jl_add_standard_imports) \ + XX(jl_adopt_thread) \ XX(jl_alignment) \ XX(jl_alloc_array_1d) \ XX(jl_alloc_array_2d) \ @@ -150,6 +151,7 @@ XX(jl_gc_add_finalizer) \ XX(jl_gc_add_finalizer_th) \ XX(jl_gc_add_ptr_finalizer) \ + XX(jl_gc_add_quiescent) \ XX(jl_gc_allocobj) \ XX(jl_gc_alloc_0w) \ XX(jl_gc_alloc_1w) \ diff --git a/src/jlapi.c b/src/jlapi.c index d1fb1e5aacf25..53a6c9b3c6859 100644 --- a/src/jlapi.c +++ b/src/jlapi.c @@ -96,9 +96,15 @@ JL_DLLEXPORT void jl_init_with_image__threading(const char *julia_bindir, jl_init_with_image(julia_bindir, image_relative_path); } +static void _jl_exception_clear(jl_task_t *ct) JL_NOTSAFEPOINT +{ + ct->ptls->previous_exception = NULL; +} + JL_DLLEXPORT jl_value_t *jl_eval_string(const char *str) { jl_value_t *r; + jl_task_t *ct = jl_current_task; JL_TRY { const char filename[] = "none"; jl_value_t *ast = jl_parse_all(str, strlen(str), @@ -106,10 +112,10 @@ JL_DLLEXPORT jl_value_t *jl_eval_string(const char *str) JL_GC_PUSH1(&ast); r = jl_toplevel_eval_in(jl_main_module, ast); JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { - jl_current_task->ptls->previous_exception = jl_current_exception(); + ct->ptls->previous_exception = jl_current_exception(); r = NULL; } return r; @@ -128,7 +134,7 @@ JL_DLLEXPORT jl_value_t *jl_exception_occurred(void) JL_DLLEXPORT void jl_exception_clear(void) { - 
jl_current_task->ptls->previous_exception = NULL; + _jl_exception_clear(jl_current_task); } // get the name of a type as a string @@ -181,7 +187,7 @@ JL_DLLEXPORT jl_value_t *jl_call(jl_function_t *f, jl_value_t **args, uint32_t n v = jl_apply(argv, nargs); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -201,7 +207,7 @@ JL_DLLEXPORT jl_value_t *jl_call0(jl_function_t *f) v = jl_apply_generic(f, NULL, 0); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -224,7 +230,7 @@ JL_DLLEXPORT jl_value_t *jl_call1(jl_function_t *f, jl_value_t *a) v = jl_apply(argv, 2); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -248,7 +254,7 @@ JL_DLLEXPORT jl_value_t *jl_call2(jl_function_t *f, jl_value_t *a, jl_value_t *b v = jl_apply(argv, 3); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -261,6 +267,7 @@ JL_DLLEXPORT jl_value_t *jl_call3(jl_function_t *f, jl_value_t *a, jl_value_t *b, jl_value_t *c) { jl_value_t *v; + jl_task_t *ct = jl_current_task; JL_TRY { jl_value_t **argv; JL_GC_PUSHARGS(argv, 4); @@ -268,16 +275,15 @@ JL_DLLEXPORT jl_value_t *jl_call3(jl_function_t *f, jl_value_t *a, argv[1] = a; argv[2] = b; argv[3] = c; - jl_task_t *ct = jl_current_task; size_t last_age = ct->world_age; ct->world_age = jl_get_world_counter(); v = jl_apply(argv, 4); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { - jl_current_task->ptls->previous_exception = jl_current_exception(); + ct->ptls->previous_exception = jl_current_exception(); v = NULL; } return v; @@ -560,8 +566,8 @@ static NOINLINE int true_main(int argc, char *argv[]) (jl_function_t*)jl_get_global(jl_base_module, jl_symbol("_start")) : NULL; if (start_client) { + jl_task_t *ct = jl_current_task; JL_TRY { - jl_task_t *ct = jl_current_task; size_t last_age = ct->world_age; ct->world_age = jl_get_world_counter(); jl_apply(&start_client, 1); diff --git a/src/julia.h b/src/julia.h index 19a94d8f2f15e..c06bb8e7fd5a0 100644 --- a/src/julia.h +++ b/src/julia.h @@ -895,6 +895,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t); JL_DLLEXPORT void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_finalize(jl_value_t *o); JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value); JL_DLLEXPORT jl_value_t *jl_gc_alloc_0w(void); @@ -1658,7 +1659,7 @@ JL_DLLEXPORT jl_sym_t *jl_get_UNAME(void) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_sym_t *jl_get_ARCH(void) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_get_libllvm(void) JL_NOTSAFEPOINT; extern JL_DLLIMPORT int jl_n_threadpools; -extern JL_DLLIMPORT int jl_n_threads; +extern JL_DLLIMPORT _Atomic(int) jl_n_threads; extern JL_DLLIMPORT int *jl_n_threads_per_pool; // environment entries @@ -1740,6 +1741,7 @@ JL_DLLEXPORT void jl_atexit_hook(int status); JL_DLLEXPORT void jl_postoutput_hook(void); JL_DLLEXPORT void JL_NORETURN jl_exit(int status); JL_DLLEXPORT const char 
*jl_pathname_for_handle(void *handle); +JL_DLLEXPORT jl_gcframe_t **jl_adopt_thread(void); JL_DLLEXPORT int jl_deserialize_verify_header(ios_t *s); JL_DLLEXPORT void jl_preload_sysimg_so(const char *fname); diff --git a/src/julia_gcext.h b/src/julia_gcext.h index 6787dafb4b7ee..6523198474771 100644 --- a/src/julia_gcext.h +++ b/src/julia_gcext.h @@ -76,10 +76,10 @@ JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, // Sweep functions will not automatically be called for objects of // foreign types, as that may not always be desired. Only calling // jl_gc_schedule_foreign_sweepfunc() on an object of a foreign type -// will result in the custome sweep function actually being called. +// will result in the custom sweep function actually being called. // This must be done at most once per object and should usually be // done right after allocating the object. -JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t * bj); +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *bj); // The following functions enable support for conservative marking. This // functionality allows the user to determine if a machine word can be diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 1fab161a4766e..00072da00fd7d 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2684,12 +2684,12 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(std::vector<int> &Colors, State // Insert GC frame stores PlaceGCFrameStores(S, AllocaSlot - 2, Colors, gcframe); // Insert GCFrame pops - for(Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - if (isa<ReturnInst>(I->getTerminator())) { + for (auto &BB : *F) { + if (isa<ReturnInst>(BB.getTerminator())) { auto popGcframe = CallInst::Create( getOrDeclare(jl_intrinsics::popGCFrame), {gcframe}); - popGcframe->insertBefore(I->getTerminator()); + popGcframe->insertBefore(BB.getTerminator()); } } } diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index f0c0c6ee77b44..3b55339984516 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -23,7 +23,7 @@ JuliaPassContext::JuliaPassContext() tbaa_gcframe(nullptr), tbaa_tag(nullptr), - pgcstack_getter(nullptr), gc_flush_func(nullptr), + pgcstack_getter(nullptr), adoptthread_func(nullptr), gc_flush_func(nullptr), gc_preserve_begin_func(nullptr), gc_preserve_end_func(nullptr), pointer_from_objref_func(nullptr), alloc_obj_func(nullptr), typeof_func(nullptr), write_barrier_func(nullptr), @@ -44,6 +44,7 @@ void JuliaPassContext::initFunctions(Module &M) tbaa_tag = tbaa_make_child_with_context(llvmctx, "jtbaa_tag", tbaa_data_scalar).first; pgcstack_getter = M.getFunction("julia.get_pgcstack"); + adoptthread_func = M.getFunction("julia.get_pgcstack_or_new"); gc_flush_func = M.getFunction("julia.gcroot_flush"); gc_preserve_begin_func = M.getFunction("llvm.julia.gc_preserve_begin"); gc_preserve_end_func = M.getFunction("llvm.julia.gc_preserve_end"); @@ -70,10 +71,13 @@ void JuliaPassContext::initAll(Module &M) llvm::CallInst *JuliaPassContext::getPGCstack(llvm::Function &F) const { - for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); pgcstack_getter && I != E; ++I) { - if (CallInst *callInst = dyn_cast<CallInst>(&*I)) { - if (callInst->getCalledOperand() == pgcstack_getter) { + if (!pgcstack_getter && !adoptthread_func) + return nullptr; + for (auto &I : F.getEntryBlock()) { + if (CallInst *callInst = dyn_cast<CallInst>(&I)) { + Value *callee = callInst->getCalledOperand(); + if
((pgcstack_getter && callee == pgcstack_getter) || + (adoptthread_func && callee == adoptthread_func)) { return callInst; } } diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 64d5dc00e2c5b..68f6efe42be6d 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -50,6 +50,7 @@ struct JuliaPassContext { // Intrinsics. llvm::Function *pgcstack_getter; + llvm::Function *adoptthread_func; llvm::Function *gc_flush_func; llvm::Function *gc_preserve_begin_func; llvm::Function *gc_preserve_end_func; diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp index e948e1c1a10bc..ad516d4ceb010 100644 --- a/src/llvm-ptls.cpp +++ b/src/llvm-ptls.cpp @@ -35,19 +35,19 @@ typedef Instruction TerminatorInst; namespace { struct LowerPTLS { - LowerPTLS(bool imaging_mode=false) - : imaging_mode(imaging_mode) + LowerPTLS(Module &M, bool imaging_mode=false) + : imaging_mode(imaging_mode), M(&M) {} - bool runOnModule(Module &M, bool *CFGModified); + bool run(bool *CFGModified); private: const bool imaging_mode; Module *M; - Function *pgcstack_getter; - MDNode *tbaa_const; - FunctionType *FT_pgcstack_getter; - PointerType *T_pgcstack_getter; - PointerType *T_pppjlvalue; + MDNode *tbaa_const{nullptr}; + MDNode *tbaa_gcframe{nullptr}; + FunctionType *FT_pgcstack_getter{nullptr}; + PointerType *T_pgcstack_getter{nullptr}; + PointerType *T_pppjlvalue{nullptr}; GlobalVariable *pgcstack_func_slot{nullptr}; GlobalVariable *pgcstack_key_slot{nullptr}; GlobalVariable *pgcstack_offset{nullptr}; @@ -55,7 +55,7 @@ struct LowerPTLS { Instruction *emit_pgcstack_tp(Value *offset, Instruction *insertBefore) const; template<typename T> T *add_comdat(T *G) const; GlobalVariable *create_aliased_global(Type *T, StringRef name) const; - void fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified); + void fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, bool or_new, bool *CFGModified); }; void LowerPTLS::set_pgcstack_attrs(CallInst *pgcstack) const @@ -159,19 +159,77 @@ inline T *LowerPTLS::add_comdat(T *G) const return G; } -void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified) +void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, bool or_new, bool *CFGModified) { if (pgcstack->use_empty()) { pgcstack->eraseFromParent(); return; } + if (or_new) { + // pgcstack(); + // if (pgcstack != nullptr) + // last_gc_state = emit_gc_unsafe_enter(ctx); + // phi = pgcstack; // fast + // else + // last_gc_state = gc_safe; + // phi = adopt(); // slow + // use phi; + // if (!retboxed) + // foreach(retinst) + // emit_gc_unsafe_leave(ctx, last_gc_state); + auto phi = PHINode::Create(pgcstack->getType(), 2, ""); + phi->insertAfter(pgcstack); + pgcstack->replaceAllUsesWith(phi); + MDBuilder MDB(pgcstack->getContext()); + SmallVector<uint32_t, 2> Weights{9, 1}; + TerminatorInst *fastTerm; + TerminatorInst *slowTerm; + auto cmp = new ICmpInst(phi, CmpInst::ICMP_NE, pgcstack, Constant::getNullValue(pgcstack->getType())); + SplitBlockAndInsertIfThenElse(cmp, phi, &fastTerm, &slowTerm, MDB.createBranchWeights(Weights)); + if (CFGModified) + *CFGModified = true; + // emit slow branch code + CallInst *adopt = cast<CallInst>(pgcstack->clone()); + Function *adoptFunc = M->getFunction(XSTR(jl_adopt_thread)); + if (adoptFunc == NULL) { + adoptFunc = Function::Create(pgcstack_getter->getFunctionType(), + pgcstack_getter->getLinkage(), pgcstack_getter->getAddressSpace(), + XSTR(jl_adopt_thread), M); + adoptFunc->copyAttributesFrom(pgcstack_getter); + adoptFunc->copyMetadata(pgcstack_getter, 0); + } + 
adopt->setCalledFunction(adoptFunc); + adopt->insertBefore(slowTerm); + phi->addIncoming(adopt, slowTerm->getParent()); + // emit fast branch code + IRBuilder<> builder(fastTerm->getParent()); + fastTerm->removeFromParent(); + MDNode *tbaa = tbaa_gcframe; + Value *prior = emit_gc_unsafe_enter(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, pgcstack), tbaa)); + builder.Insert(fastTerm); + phi->addIncoming(pgcstack, fastTerm->getParent()); + // emit pre-return cleanup + if (CountTrackedPointers(pgcstack->getParent()->getParent()->getReturnType()).count == 0) { + auto last_gc_state = PHINode::Create(Type::getInt8Ty(pgcstack->getContext()), 2, "", phi); + // if we called jl_adopt_thread, we must end this cfunction back in the safe-state + last_gc_state->addIncoming(ConstantInt::get(Type::getInt8Ty(M->getContext()), JL_GC_STATE_SAFE), slowTerm->getParent()); + last_gc_state->addIncoming(prior, fastTerm->getParent()); + for (auto &BB : *pgcstack->getParent()->getParent()) { + if (isa<ReturnInst>(BB.getTerminator())) { + IRBuilder<> builder(BB.getTerminator()); + emit_gc_unsafe_leave(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, phi), tbaa), last_gc_state); + } + } + } + } if (imaging_mode) { if (jl_tls_elf_support) { // if (offset != 0) - // pgcstack = tp + offset; + // pgcstack = tp + offset; // fast // else - // pgcstack = getter(); + // pgcstack = getter(); // slow auto offset = new LoadInst(getSizeTy(pgcstack->getContext()), pgcstack_offset, "", false, pgcstack); offset->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const); offset->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(pgcstack->getContext(), None)); @@ -184,7 +242,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified) SplitBlockAndInsertIfThenElse(cmp, pgcstack, &fastTerm, &slowTerm, MDB.createBranchWeights(Weights)); if (CFGModified) - *CFGModified = true; + *CFGModified = true; auto fastTLS = emit_pgcstack_tp(offset, fastTerm); auto phi = PHINode::Create(T_pppjlvalue, 2, "", pgcstack); @@ -248,37 +306,44 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified) } } -bool LowerPTLS::runOnModule(Module &_M, bool *CFGModified) +bool LowerPTLS::run(bool *CFGModified) { - M = &_M; - pgcstack_getter = M->getFunction("julia.get_pgcstack"); - if (!pgcstack_getter) - return false; + bool need_init = true; + auto runOnGetter = [&](bool or_new) { + Function *pgcstack_getter = M->getFunction(or_new ?
"julia.get_pgcstack_or_new" : "julia.get_pgcstack"); + if (!pgcstack_getter) + return false; - tbaa_const = tbaa_make_child_with_context(_M.getContext(), "jtbaa_const", nullptr, true).first; + if (need_init) { + tbaa_const = tbaa_make_child_with_context(M->getContext(), "jtbaa_const", nullptr, true).first; + tbaa_gcframe = tbaa_make_child_with_context(M->getContext(), "jtbaa_gcframe").first; - FT_pgcstack_getter = pgcstack_getter->getFunctionType(); + FT_pgcstack_getter = pgcstack_getter->getFunctionType(); #if defined(_OS_DARWIN_) - assert(sizeof(jl_pgcstack_key_t) == sizeof(uintptr_t)); - FT_pgcstack_getter = FunctionType::get(FT_pgcstack_getter->getReturnType(), {getSizeTy(_M.getContext())}, false); + assert(sizeof(jl_pgcstack_key_t) == sizeof(uintptr_t)); + FT_pgcstack_getter = FunctionType::get(FT_pgcstack_getter->getReturnType(), {getSizeTy(M->getContext())}, false); #endif - T_pgcstack_getter = FT_pgcstack_getter->getPointerTo(); - T_pppjlvalue = cast(FT_pgcstack_getter->getReturnType()); - if (imaging_mode) { - pgcstack_func_slot = create_aliased_global(T_pgcstack_getter, "jl_pgcstack_func_slot"); - pgcstack_key_slot = create_aliased_global(getSizeTy(_M.getContext()), "jl_pgcstack_key_slot"); // >= sizeof(jl_pgcstack_key_t) - pgcstack_offset = create_aliased_global(getSizeTy(_M.getContext()), "jl_tls_offset"); - } + T_pgcstack_getter = FT_pgcstack_getter->getPointerTo(); + T_pppjlvalue = cast(FT_pgcstack_getter->getReturnType()); + if (imaging_mode) { + pgcstack_func_slot = create_aliased_global(T_pgcstack_getter, "jl_pgcstack_func_slot"); + pgcstack_key_slot = create_aliased_global(getSizeTy(M->getContext()), "jl_pgcstack_key_slot"); // >= sizeof(jl_pgcstack_key_t) + pgcstack_offset = create_aliased_global(getSizeTy(M->getContext()), "jl_tls_offset"); + } + need_init = false; + } - for (auto it = pgcstack_getter->user_begin(); it != pgcstack_getter->user_end();) { - auto call = cast(*it); - ++it; - assert(call->getCalledOperand() == pgcstack_getter); - fix_pgcstack_use(call, CFGModified); - } - assert(pgcstack_getter->use_empty()); - pgcstack_getter->eraseFromParent(); - return true; + for (auto it = pgcstack_getter->user_begin(); it != pgcstack_getter->user_end();) { + auto call = cast(*it); + ++it; + assert(call->getCalledOperand() == pgcstack_getter); + fix_pgcstack_use(call, pgcstack_getter, or_new, CFGModified); + } + assert(pgcstack_getter->use_empty()); + pgcstack_getter->eraseFromParent(); + return true; + }; + return runOnGetter(false) + runOnGetter(true); } struct LowerPTLSLegacy: public ModulePass { @@ -290,8 +355,8 @@ struct LowerPTLSLegacy: public ModulePass { bool imaging_mode; bool runOnModule(Module &M) override { - LowerPTLS lower(imaging_mode); - return lower.runOnModule(M, nullptr); + LowerPTLS lower(M, imaging_mode); + return lower.run(nullptr); } }; @@ -304,9 +369,9 @@ static RegisterPass X("LowerPTLS", "LowerPTLS Pass", } // anonymous namespace PreservedAnalyses LowerPTLSPass::run(Module &M, ModuleAnalysisManager &AM) { - LowerPTLS lower(imaging_mode); + LowerPTLS lower(M, imaging_mode); bool CFGModified = false; - if (lower.runOnModule(M, &CFGModified)) { + if (lower.run(&CFGModified)) { if (CFGModified) { return PreservedAnalyses::none(); } else { diff --git a/src/partr.c b/src/partr.c index eeb0d0f456d97..ec6bbe3e5720a 100644 --- a/src/partr.c +++ b/src/partr.c @@ -26,6 +26,9 @@ static const int16_t not_sleeping = 0; // it is acceptable for the thread to be sleeping. static const int16_t sleeping = 1; +// this thread is dead. 
+static const int16_t sleeping_like_the_dead JL_UNUSED = 2; + // invariant: No thread is ever asleep unless sleep_check_state is sleeping (or we have a wakeup signal pending). // invariant: Any particular thread is not asleep unless that thread's sleep_check_state is sleeping. // invariant: The transition of a thread state to sleeping must be followed by a check that there wasn't work pending for it. @@ -182,7 +185,7 @@ static int sleep_check_after_threshold(uint64_t *start_cycles) static int wake_thread(int16_t tid) { - jl_ptls_t other = jl_all_tls_states[tid]; + jl_ptls_t other = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; int8_t state = sleeping; if (jl_atomic_load_relaxed(&other->sleep_check_state) == sleeping) { @@ -229,7 +232,7 @@ JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) if (wake_thread(tid)) { // check if we need to notify uv_run too jl_fence(); - jl_ptls_t other = jl_all_tls_states[tid]; + jl_ptls_t other = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_task_t *tid_task = jl_atomic_load_relaxed(&other->current_task); // now that we have changed the thread to not-sleeping, ensure that // either it has not yet acquired the libuv lock, or that it will @@ -244,7 +247,8 @@ JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) // in the future, we might want to instead wake some fraction of threads, // and let each of those wake additional threads if they find work int anysleep = 0; - for (tid = 0; tid < jl_n_threads; tid++) { + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + for (tid = 0; tid < nthreads; tid++) { if (tid != self) anysleep |= wake_thread(tid); } diff --git a/src/safepoint.c b/src/safepoint.c index b2feccf74e068..1ff26d616a5d8 100644 --- a/src/safepoint.c +++ b/src/safepoint.c @@ -111,10 +111,6 @@ void jl_safepoint_init(void) int jl_safepoint_start_gc(void) { - if (jl_n_threads == 1) { - jl_atomic_store_relaxed(&jl_gc_running, 1); - return 1; - } // The thread should have set this already assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) == JL_GC_STATE_WAITING); uv_mutex_lock(&safepoint_lock); @@ -137,10 +133,6 @@ int jl_safepoint_start_gc(void) void jl_safepoint_end_gc(void) { assert(jl_atomic_load_relaxed(&jl_gc_running)); - if (jl_n_threads == 1) { - jl_atomic_store_relaxed(&jl_gc_running, 0); - return; - } uv_mutex_lock(&safepoint_lock); // Need to reset the page protection before resetting the flag since // the thread will trigger a segfault immediately after returning from diff --git a/src/signal-handling.c b/src/signal-handling.c index 43782bf4070f2..698b1e21febb1 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -182,14 +182,10 @@ static int *profile_get_randperm(int size) JL_DLLEXPORT int jl_profile_is_buffer_full(void) { - // declare buffer full if there isn't enough room to take samples across all threads - #if defined(_OS_WINDOWS_) - uint64_t nthreads = 1; // windows only profiles the main thread - #else - uint64_t nthreads = jl_n_threads; - #endif - // the `+ 6` is for the two block terminators `0` plus 4 metadata entries - return bt_size_cur + (((JL_BT_MAX_ENTRY_SIZE + 1) + 6) * nthreads) > bt_size_max; + // Declare buffer full if there isn't enough room to sample even just the + // thread metadata and one max-sized frame. The `+ 6` is for the two block + // terminator `0`'s plus the 4 metadata entries. 
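Spelled out, the bound being guarded is one sample's worst case: a backtrace of at most `JL_BT_MAX_ENTRY_SIZE + 1` words, two zero-word block terminators, and the four per-sample metadata entries. Since the profiler now shares one buffer across all threads instead of pre-partitioning it per thread, the headroom no longer scales with the thread count. An equivalent formulation, assuming this file's `bt_size_cur`/`bt_size_max` globals (the helper name is hypothetical):

    static int profile_buffer_would_overflow(void)
    {
        size_t one_sample = (JL_BT_MAX_ENTRY_SIZE + 1) // one maximum-size frame
                          + 2                          // two block-terminating zeros
                          + 4;                         // per-sample metadata entries
        return bt_size_cur + one_sample > bt_size_max;
    }

which is exactly what the `return` below computes: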
+ return bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > bt_size_max; } static uint64_t jl_last_sigint_trigger = 0; diff --git a/src/signals-mach.c b/src/signals-mach.c index 5a1816a80f2b2..edc2b42215f67 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -50,7 +50,7 @@ void jl_mach_gc_end(void) uintptr_t item = (uintptr_t)suspended_threads.items[i]; int16_t tid = (int16_t)item; int8_t gc_state = (int8_t)(item >> 8); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_atomic_store_release(&ptls2->gc_state, gc_state); thread_resume(pthread_mach_thread_np(ptls2->system_id)); } @@ -119,7 +119,8 @@ static void allocate_mach_handler() if (_keymgr_set_lockmode_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, NM_ALLOW_RECURSION)) jl_error("_keymgr_set_lockmode_processwide_ptr failed"); - arraylist_new(&suspended_threads, jl_n_threads); + int16_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + arraylist_new(&suspended_threads, nthreads); // we will resize later (inside safepoint_lock), if needed pthread_t thread; pthread_attr_t attr; kern_return_t ret; @@ -221,7 +222,7 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exceptio host_thread_state_t state; kern_return_t ret = thread_get_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, &count); HANDLE_MACH_ERROR("thread_get_state", ret); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; if (!jl_get_safe_restore()) { assert(exception); ptls2->bt_size = @@ -265,8 +266,9 @@ kern_return_t catch_mach_exception_raise( #endif int16_t tid; jl_ptls_t ptls2 = NULL; - for (tid = 0; tid < jl_n_threads; tid++) { - jl_ptls_t _ptls2 = jl_all_tls_states[tid]; + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + for (tid = 0; tid < nthreads; tid++) { + jl_ptls_t _ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; if (pthread_mach_thread_np(_ptls2->system_id) == thread) { ptls2 = _ptls2; break; @@ -381,9 +383,15 @@ static void attach_exception_port(thread_port_t thread, int segv_only) HANDLE_MACH_ERROR("thread_set_exception_ports", ret); } -static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) +static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) { - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + if (ptls2 == NULL) // this thread is not alive + return 0; + jl_task_t *ct2 = ptls2 ? 
jl_atomic_load_relaxed(&ptls2->current_task) : NULL; + if (ct2 == NULL) // this thread is already dead + return 0; + mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); kern_return_t ret = thread_suspend(thread); @@ -395,18 +403,22 @@ static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) // Get the state of the suspended thread ret = thread_get_state(thread, MACH_THREAD_STATE, (thread_state_t)ctx, &count); + return 1; } static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) { static host_thread_state_t state; - jl_thread_suspend_and_get_state2(tid, &state); + if (!jl_thread_suspend_and_get_state2(tid, &state)) { + *ctx = NULL; + return; + } *ctx = (unw_context_t*)&state; } static void jl_thread_resume(int tid, int sig) { - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); kern_return_t ret = thread_resume(thread); HANDLE_MACH_ERROR("thread_resume", ret); @@ -416,7 +428,7 @@ static void jl_thread_resume(int tid, int sig) // or if SIGINT happens too often. static void jl_try_deliver_sigint(void) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); kern_return_t ret = thread_suspend(thread); @@ -452,11 +464,12 @@ CFI_NORETURN static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); host_thread_state_t state; - jl_thread_suspend_and_get_state2(0, &state); + if (!jl_thread_suspend_and_get_state2(0, &state)) + return; unw_context_t *uc = (unw_context_t*)&state; // This aborts `sleep` and other syscalls. @@ -608,8 +621,9 @@ void *mach_profile_listener(void *arg) // (so that thread zero gets notified last) int keymgr_locked = jl_lock_profile_mach(0); - int *randperm = profile_get_randperm(jl_n_threads); - for (int idx = jl_n_threads; idx-- > 0; ) { + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + int *randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { // Stop the threads in the random or reverse round-robin order. 
int i = randperm[idx]; // if there is no space left, break early @@ -621,7 +635,8 @@ void *mach_profile_listener(void *arg) if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) _dyld_atfork_prepare(); // briefly acquire the dlsym lock host_thread_state_t state; - jl_thread_suspend_and_get_state2(i, &state); + if (!jl_thread_suspend_and_get_state2(i, &state)) + continue; unw_context_t *uc = (unw_context_t*)&state; if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) _dyld_atfork_parent(); // quickly release the dlsym lock @@ -660,12 +675,12 @@ void *mach_profile_listener(void *arg) #else bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); #endif - jl_ptls_t ptls = jl_all_tls_states[i]; + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; // store threadid but add 1 as 0 is preserved to indicate end of block bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - // store task id + // store task id (never null) bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); // store cpu cycle clock diff --git a/src/signals-unix.c b/src/signals-unix.c index dadbd15de0832..1716bb21883af 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -372,7 +372,14 @@ static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) clock_gettime(CLOCK_REALTIME, &ts); ts.tv_sec += 1; pthread_mutex_lock(&in_signal_lock); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL; + if (ct2 == NULL) { + // this thread is not alive or already dead + *ctx = NULL; + pthread_mutex_unlock(&in_signal_lock); + return; + } jl_atomic_store_release(&ptls2->signal_request, 1); pthread_kill(ptls2->system_id, SIGUSR2); // wait for thread to acknowledge @@ -404,7 +411,7 @@ static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) static void jl_thread_resume(int tid, int sig) { - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_atomic_store_release(&ptls2->signal_request, sig == -1 ? 3 : 1); pthread_cond_broadcast(&exit_signal_cond); pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge @@ -420,7 +427,7 @@ static void jl_thread_resume(int tid, int sig) // or if SIGINT happens too often. 
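As on macOS, the Unix path above treats a NULL `current_task` as the liveness flag for a peer thread: `jl_delete_thread` clears it while holding `in_signal_lock`, so `jl_thread_suspend_and_get_state` can hand back a NULL context instead of signaling a thread that no longer exists. Every caller must now check for that; a hypothetical caller pattern (locking and error handling elided):

    static void sample_one_thread(int tid)
    {
        unw_context_t *signal_context;
        jl_thread_suspend_and_get_state(tid, &signal_context);
        if (signal_context == NULL)
            return; // thread exited after adoption; do not unwind or resume it
        // ... record a backtrace from signal_context ...
        jl_thread_resume(tid, 0);
    }

The SIGINT delivery below goes through the same atomic table load for thread 0: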
static void jl_try_deliver_sigint(void) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; jl_safepoint_enable_sigint(); jl_wake_libuv(); jl_atomic_store_release(&ptls2->signal_request, 2); @@ -451,7 +458,7 @@ CFI_NORETURN static void jl_exit_thread0(int state, jl_bt_element_t *bt_data, size_t bt_size) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; if (thread0_exit_count <= 1) { unw_context_t *signal_context; jl_thread_suspend_and_get_state(0, &signal_context); @@ -701,7 +708,7 @@ void trigger_profile_peek(void) if (bt_size_max == 0){ // If the buffer hasn't been initialized, initialize with default size // Keep these values synchronized with Profile.default_init() - if (jl_profile_init(10000000 * jl_n_threads, 1000000) == -1){ + if (jl_profile_init(10000000, 1000000) == -1) { jl_safe_printf("ERROR: could not initialize the profile buffer"); return; } @@ -831,6 +838,7 @@ static void *signal_listener(void *arg) } #endif + int nthreads = jl_atomic_load_acquire(&jl_n_threads); bt_size = 0; #if !defined(JL_DISABLE_LIBUNWIND) unw_context_t *signal_context; @@ -840,8 +848,8 @@ static void *signal_listener(void *arg) jl_lock_profile(); int *randperm; if (profile) - randperm = profile_get_randperm(jl_n_threads); - for (int idx = jl_n_threads; idx-- > 0; ) { + randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { // Stop the threads in the random or reverse round-robin order. int i = profile ? randperm[idx] : idx; // notify thread to stop @@ -853,7 +861,7 @@ static void *signal_listener(void *arg) // this part must be signal-handler safe if (critical) { bt_size += rec_backtrace_ctx(bt_data + bt_size, - JL_MAX_BT_SIZE / jl_n_threads - 1, + JL_MAX_BT_SIZE / nthreads - 1, signal_context, NULL); bt_data[bt_size++].uintptr = 0; } @@ -880,12 +888,12 @@ static void *signal_listener(void *arg) } jl_set_safe_restore(old_buf); - jl_ptls_t ptls2 = jl_all_tls_states[i]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; // store threadid but add 1 as 0 is preserved to indicate end of block bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1; - // store task id + // store task id (never null) bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task); // store cpu cycle clock @@ -927,11 +935,11 @@ static void *signal_listener(void *arg) else { #ifndef SIGINFO // SIGINFO already prints this automatically int nrunning = 0; - for (int idx = jl_n_threads; idx-- > 0; ) { - jl_ptls_t ptls2 = jl_all_tls_states[idx]; + for (int idx = nthreads; idx-- > 0; ) { + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[idx]; nrunning += !jl_atomic_load_relaxed(&ptls2->sleep_check_state); } - jl_safe_printf("\ncmd: %s %d running %d of %d\n", jl_options.julia_bin ? jl_options.julia_bin : "julia", uv_os_getpid(), nrunning, jl_n_threads); + jl_safe_printf("\ncmd: %s %d running %d of %d\n", jl_options.julia_bin ? jl_options.julia_bin : "julia", uv_os_getpid(), nrunning, nthreads); #endif jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig)); diff --git a/src/signals-win.c b/src/signals-win.c index 178a7463b8d50..83e92ff400e1d 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -165,7 +165,7 @@ HANDLE hMainThread = INVALID_HANDLE_VALUE; // Try to throw the exception in the master thread. 
static void jl_try_deliver_sigint(void) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; jl_lock_profile(); jl_safepoint_enable_sigint(); jl_wake_libuv(); @@ -362,12 +362,12 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, &ctxThread, NULL); - jl_ptls_t ptls = jl_all_tls_states[0]; // given only profiling hMainThread + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread // store threadid but add 1 as 0 is preserved to indicate end of block bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - // store task id + // store task id (never null) bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); // store cpu cycle clock diff --git a/src/task.c b/src/task.c index 22a5ad214e0b8..427c16b2481ab 100644 --- a/src/task.c +++ b/src/task.c @@ -265,7 +265,8 @@ JL_DLLEXPORT void *jl_task_stack_buffer(jl_task_t *task, size_t *size, int *ptid { size_t off = 0; #ifndef _OS_WINDOWS_ - if (jl_all_tls_states[0]->root_task == task) { + jl_ptls_t ptls0 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; + if (ptls0->root_task == task) { // See jl_init_root_task(). The root task of the main thread // has its buffer enlarged by an artificial 3000000 bytes, but // that means that the start of the buffer usually points to @@ -306,7 +307,8 @@ JL_DLLEXPORT void jl_active_task_stack(jl_task_t *task, else if (task->stkbuf) { *total_start = *active_start = (char*)task->stkbuf; #ifndef _OS_WINDOWS_ - if (jl_all_tls_states[0]->root_task == task) { + jl_ptls_t ptls0 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; + if (ptls0->root_task == task) { // See jl_init_root_task(). 
The root task of the main thread // has its buffer enlarged by an artificial 3000000 bytes, but // that means that the start of the buffer usually points to diff --git a/src/threading.c b/src/threading.c index 2cebdb22fc0aa..581032092168c 100644 --- a/src/threading.c +++ b/src/threading.c @@ -46,12 +46,16 @@ JL_DLLEXPORT void *jl_get_ptls_states(void) return jl_current_task->ptls; } +static void jl_delete_thread(void*); + #if !defined(_OS_WINDOWS_) +static pthread_key_t jl_task_exit_key; static pthread_key_t jl_safe_restore_key; __attribute__((constructor)) void _jl_init_safe_restore(void) { pthread_key_create(&jl_safe_restore_key, NULL); + pthread_key_create(&jl_task_exit_key, jl_delete_thread); } JL_DLLEXPORT jl_jmp_buf *jl_get_safe_restore(void) @@ -124,21 +128,26 @@ static DWORD jl_safe_restore_key; BOOLEAN WINAPI DllMain(IN HINSTANCE hDllHandle, IN DWORD nReason, IN LPVOID Reserved) { + jl_task_t *ct; switch (nReason) { case DLL_PROCESS_ATTACH: jl_pgcstack_key = TlsAlloc(); assert(jl_pgcstack_key != TLS_OUT_OF_INDEXES); jl_safe_restore_key = TlsAlloc(); assert(jl_safe_restore_key != TLS_OUT_OF_INDEXES); - // Fall through - case DLL_THREAD_ATTACH: - break; - case DLL_THREAD_DETACH: break; case DLL_PROCESS_DETACH: TlsFree(jl_pgcstack_key); TlsFree(jl_safe_restore_key); break; + case DLL_THREAD_ATTACH: + // will call jl_adopt_thread lazily on-demand + break; + case DLL_THREAD_DETACH: + ct = jl_get_current_task(); + if (ct != NULL) + jl_delete_thread((void*)ct->ptls); + break; } return 1; // success } @@ -291,7 +300,8 @@ void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t *k) #endif static uv_mutex_t tls_lock; // controls write-access to these variables: -jl_ptls_t *jl_all_tls_states JL_GLOBALLY_ROOTED; +_Atomic(jl_ptls_t*) jl_all_tls_states JL_GLOBALLY_ROOTED; +int jl_all_tls_states_size; static uv_cond_t cond; // return calling thread's ID @@ -302,7 +312,8 @@ JL_DLLEXPORT int16_t jl_threadid(void) JL_DLLEXPORT int8_t jl_threadpoolid(int16_t tid) JL_NOTSAFEPOINT { - if (tid < 0 || tid >= jl_n_threads) + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + if (tid < 0 || tid >= nthreads) jl_error("invalid tid"); int n = 0; for (int i = 0; i < jl_n_threadpools; i++) { @@ -310,14 +321,25 @@ JL_DLLEXPORT int8_t jl_threadpoolid(int16_t tid) JL_NOTSAFEPOINT if (tid < n) return (int8_t)i; } - jl_error("internal error: couldn't determine threadpool id"); + return 0; // everything else uses threadpool 0 (though does not become part of any threadpool) } jl_ptls_t jl_init_threadtls(int16_t tid) { +#ifndef _OS_WINDOWS_ + if (pthread_getspecific(jl_task_exit_key)) + abort(); +#endif + if (jl_get_pgcstack() != NULL) + abort(); jl_ptls_t ptls = (jl_ptls_t)calloc(1, sizeof(jl_tls_states_t)); +#ifndef _OS_WINDOWS_ + pthread_setspecific(jl_task_exit_key, (void*)ptls); +#endif ptls->system_id = (jl_thread_t)(uintptr_t)uv_thread_self(); ptls->rngseed = jl_rand(); + if (tid == 0) + ptls->disable_gc = 1; #ifdef _OS_WINDOWS_ if (tid == 0) { if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(), @@ -328,7 +350,6 @@ jl_ptls_t jl_init_threadtls(int16_t tid) } } #endif - ptls->tid = tid; jl_atomic_store_relaxed(&ptls->gc_state, 0); // GC unsafe // Conditionally initialize the safepoint address. 
See comment in // `safepoint.c` @@ -349,11 +370,80 @@ jl_ptls_t jl_init_threadtls(int16_t tid) uv_mutex_init(&ptls->sleep_lock); uv_cond_init(&ptls->wake_signal); - jl_all_tls_states[tid] = ptls; + uv_mutex_lock(&tls_lock); + jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); + if (tid == -1) + tid = jl_atomic_load_relaxed(&jl_n_threads); + ptls->tid = tid; + if (jl_all_tls_states_size <= tid) { + int i, newsize = jl_all_tls_states_size + tid + 2; + jl_ptls_t *newpptls = (jl_ptls_t*)calloc(newsize, sizeof(jl_ptls_t)); + for (i = 0; i < jl_all_tls_states_size; i++) { + newpptls[i] = allstates[i]; + } + jl_atomic_store_release(&jl_all_tls_states, newpptls); + jl_all_tls_states_size = newsize; + jl_gc_add_quiescent(ptls, (void**)allstates, free); + allstates = newpptls; + } + allstates[tid] = ptls; + if (jl_atomic_load_relaxed(&jl_n_threads) < tid + 1) + jl_atomic_store_release(&jl_n_threads, tid + 1); + jl_fence(); + uv_mutex_unlock(&tls_lock); return ptls; } +JL_DLLEXPORT jl_gcframe_t **jl_adopt_thread(void) +{ + // initialize this thread (assign tid, create heap, set up root task) + jl_ptls_t ptls = jl_init_threadtls(-1); + void *stack_lo, *stack_hi; + jl_init_stack_limits(0, &stack_lo, &stack_hi); + + (void)jl_gc_unsafe_enter(ptls); + // warning: this changes `jl_current_task`, so be careful not to call that from this function + jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi); + JL_GC_PROMISE_ROOTED(ct); + + return &ct->gcstack; +} + +static void jl_delete_thread(void *value) +{ + jl_ptls_t ptls = (jl_ptls_t)value; + // Acquire the profile write lock, to ensure we are not racing with the `kill` + // call in the profile code which will also try to look at these variables. + // We have no control over when the user calls pthread_join, so we must do + // this here by blocking. This also synchronizes our read of `current_task` + // (which is the flag we currently use to check the liveness state of a thread). +#ifdef _OS_WINDOWS_ + jl_lock_profile_wr(); +#elif defined(JL_DISABLE_LIBUNWIND) + // nothing +#elif defined(__APPLE__) + jl_lock_profile_wr(); +#else + pthread_mutex_lock(&in_signal_lock); +#endif +#ifndef _OS_WINDOWS_ + pthread_setspecific(jl_task_exit_key, NULL); +#endif + jl_atomic_store_relaxed(&ptls->current_task, NULL); // dead + jl_atomic_store_relaxed(&ptls->sleep_check_state, 2); // dead, interpreted as sleeping and unwakeable +#ifdef _OS_WINDOWS_ + jl_unlock_profile_wr(); +#elif defined(JL_DISABLE_LIBUNWIND) + // nothing +#elif defined(__APPLE__) + jl_unlock_profile_wr(); +#else + pthread_mutex_unlock(&in_signal_lock); +#endif + (void)jl_gc_safe_enter(ptls); +} + JL_DLLEXPORT jl_mutex_t jl_codegen_lock; jl_mutex_t typecache_lock; @@ -467,7 +557,6 @@ void jl_init_threading(void) uv_mutex_init(&tls_lock); uv_cond_init(&cond); - #ifdef JL_ELF_TLS_VARIANT jl_check_tls(); #endif @@ -477,8 +566,8 @@ void jl_init_threading(void) // environment variable. Set the globals `jl_n_threadpools`, `jl_n_threads` // and `jl_n_threads_per_pool`. 
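Stepping back to the table-growth logic in `jl_init_threadtls` above: the old `jl_all_tls_states` array is never freed eagerly. A racing reader may have loaded the old pointer just before the release store of the new one, so the array is handed to the newly exported `jl_gc_add_quiescent`, which defers the `free` until every thread has passed a GC quiescent point, an RCU-style retirement. A hedged sketch of that usage (helper name hypothetical; the caller is assumed to have already published the replacement table):

    // `oldtab` may still be in use by readers that loaded it before the
    // release store of the new table, so it is retired rather than freed:
    // free(oldtab) runs at the next quiescent point, not immediately.
    static void retire_old_tls_table(jl_ptls_t ptls, jl_ptls_t *oldtab)
    {
        jl_gc_add_quiescent(ptls, (void**)oldtab, free);
    }

The startup path that follows then fills in the globals named just above: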
jl_n_threadpools = 1; - jl_n_threads = JULIA_NUM_THREADS; - int16_t nthreads = jl_n_threads, nthreadsi = 0; + int16_t nthreads = JULIA_NUM_THREADS; + int16_t nthreadsi = 0; char *endptr, *endptri; if (jl_options.nthreads != 0) { // --threads specified @@ -516,26 +605,26 @@ void jl_init_threading(void) } } - jl_n_threads = nthreads + nthreadsi; - jl_n_threads_per_pool = (int *)malloc(2 * sizeof(int)); + jl_all_tls_states_size = nthreads + nthreadsi; + jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreads; jl_n_threads_per_pool[1] = nthreadsi; -#ifndef __clang_gcanalyzer__ - jl_all_tls_states = (jl_ptls_t*)calloc(jl_n_threads, sizeof(void*)); -#endif + jl_atomic_store_release(&jl_all_tls_states, (jl_ptls_t*)calloc(jl_all_tls_states_size, sizeof(jl_ptls_t))); + jl_atomic_store_release(&jl_n_threads, jl_all_tls_states_size); } static uv_barrier_t thread_init_done; void jl_start_threads(void) { + int nthreads = jl_atomic_load_relaxed(&jl_n_threads); int cpumasksize = uv_cpumask_size(); char *cp; int i, exclusive; uv_thread_t uvtid; - if (cpumasksize < jl_n_threads) // also handles error case - cpumasksize = jl_n_threads; + if (cpumasksize < nthreads) // also handles error case + cpumasksize = nthreads; char *mask = (char*)alloca(cpumasksize); // do we have exclusive use of the machine? default is no @@ -548,7 +637,7 @@ void jl_start_threads(void) // according to a 'compact' policy // non-exclusive: no affinity settings; let the kernel move threads about if (exclusive) { - if (jl_n_threads > jl_cpu_threads()) { + if (nthreads > jl_cpu_threads()) { jl_printf(JL_STDERR, "ERROR: Too many threads requested for %s option.\n", MACHINE_EXCLUSIVE_NAME); exit(1); } @@ -559,9 +648,6 @@ void jl_start_threads(void) mask[0] = 0; } - // The analyzer doesn't know jl_n_threads doesn't change, help it - size_t nthreads = jl_n_threads; - // create threads uv_barrier_init(&thread_init_done, nthreads); diff --git a/src/threading.h b/src/threading.h index 4c6f1e19881f5..9fd63f0fd188d 100644 --- a/src/threading.h +++ b/src/threading.h @@ -12,7 +12,7 @@ extern "C" { #define PROFILE_JL_THREADING 0 -extern jl_ptls_t *jl_all_tls_states JL_GLOBALLY_ROOTED; /* thread local storage */ +extern _Atomic(jl_ptls_t*) jl_all_tls_states JL_GLOBALLY_ROOTED; /* thread local storage */ typedef struct _jl_threadarg_t { int16_t tid; diff --git a/stdlib/Distributed/test/distributed_exec.jl b/stdlib/Distributed/test/distributed_exec.jl index 8ed55550e61b9..116f489677996 100644 --- a/stdlib/Distributed/test/distributed_exec.jl +++ b/stdlib/Distributed/test/distributed_exec.jl @@ -153,12 +153,12 @@ function _getenv_include_thread_unsafe() end const _env_include_thread_unsafe = _getenv_include_thread_unsafe() function include_thread_unsafe_tests() - if Threads.nthreads() > 1 + if Threads.maxthreadid() > 1 if _env_include_thread_unsafe return true end - msg = "Skipping a thread-unsafe test because `Threads.nthreads() > 1`" - @warn msg Threads.nthreads() + msg = "Skipping a thread-unsafe test because `Threads.maxthreadid() > 1`" + @warn msg Threads.maxthreadid() Test.@test_broken false return false end diff --git a/stdlib/InteractiveUtils/src/InteractiveUtils.jl b/stdlib/InteractiveUtils/src/InteractiveUtils.jl index 4621ed07ed124..8d0f23f5756ce 100644 --- a/stdlib/InteractiveUtils/src/InteractiveUtils.jl +++ b/stdlib/InteractiveUtils/src/InteractiveUtils.jl @@ -141,7 +141,7 @@ function versioninfo(io::IO=stdout; verbose::Bool=false) println(io, " WORD_SIZE: ", Sys.WORD_SIZE) println(io, " LIBM: 
",Base.libm_name) println(io, " LLVM: libLLVM-",Base.libllvm_version," (", Sys.JIT, ", ", Sys.CPU_NAME, ")") - println(io, " Threads: ", Threads.nthreads(), " on ", Sys.CPU_THREADS, " virtual cores") + println(io, " Threads: ", Threads.maxthreadid(), " on ", Sys.CPU_THREADS, " virtual cores") function is_nonverbose_env(k::String) return occursin(r"^JULIA_|^DYLD_|^LD_", k) diff --git a/stdlib/LinearAlgebra/src/LinearAlgebra.jl b/stdlib/LinearAlgebra/src/LinearAlgebra.jl index 030844a9e88e7..e4af012cc5f13 100644 --- a/stdlib/LinearAlgebra/src/LinearAlgebra.jl +++ b/stdlib/LinearAlgebra/src/LinearAlgebra.jl @@ -548,7 +548,8 @@ function versioninfo(io::IO=stdout) println(io, indent, "--> ", lib.libname, " (", interface, ")") end println(io, "Threading:") - println(io, indent, "Threads.nthreads() = ", Base.Threads.nthreads()) + println(io, indent, "Threads.threadpoolsize() = ", Threads.threadpoolsize()) + println(io, indent, "Threads.maxthreadid() = ", Base.Threads.maxthreadid()) println(io, indent, "LinearAlgebra.BLAS.get_num_threads() = ", BLAS.get_num_threads()) println(io, "Relevant environment variables:") env_var_names = [ diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index f260be3ae4dfe..f7cdb2ca39b84 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -102,8 +102,7 @@ function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} end delay_cur = ccall(:jl_profile_delay_nsec, UInt64, ())/10^9 if n === nothing && delay === nothing - nthreads = Sys.iswindows() ? 1 : Threads.nthreads() # windows only profiles the main thread - return round(Int, n_cur / nthreads), delay_cur + return n_cur, delay_cur end nnew = (n === nothing) ? n_cur : n delaynew = (delay === nothing) ? delay_cur : delay @@ -111,20 +110,17 @@ function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} end function init(n::Integer, delay::Real; limitwarn::Bool = true) - nthreads = Sys.iswindows() ? 
1 : Threads.nthreads() # windows only profiles the main thread sample_size_bytes = sizeof(Ptr) # == Sys.WORD_SIZE / 8 - buffer_samples = n * nthreads + buffer_samples = n buffer_size_bytes = buffer_samples * sample_size_bytes if buffer_size_bytes > 2^29 && Sys.WORD_SIZE == 32 - buffer_size_bytes_per_thread = floor(Int, 2^29 / nthreads) - buffer_samples_per_thread = floor(Int, buffer_size_bytes_per_thread / sample_size_bytes) - buffer_samples = buffer_samples_per_thread * nthreads + buffer_samples = floor(Int, 2^29 / sample_size_bytes) buffer_size_bytes = buffer_samples * sample_size_bytes - limitwarn && @warn "Requested profile buffer limited to 512MB (n = $buffer_samples_per_thread per thread) given that this system is 32-bit" + limitwarn && @warn "Requested profile buffer limited to 512MB (n = $buffer_samples) given that this system is 32-bit" end - status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), buffer_samples, round(UInt64,10^9*delay)) + status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), buffer_samples, round(UInt64, 10^9*delay)) if status == -1 - error("could not allocate space for ", n, " instruction pointers per thread being profiled ($nthreads threads, $(Base.format_bytes(buffer_size_bytes)) total)") + error("could not allocate space for ", n, " instruction pointers ($(Base.format_bytes(buffer_size_bytes)))") end end @@ -427,7 +423,7 @@ function getdict!(dict::LineInfoDict, data::Vector{UInt}) n_unique_ips = length(unique_ips) n_unique_ips == 0 && return dict iplookups = similar(unique_ips, Vector{StackFrame}) - @sync for indexes_part in Iterators.partition(eachindex(unique_ips), div(n_unique_ips, Threads.nthreads(), RoundUp)) + @sync for indexes_part in Iterators.partition(eachindex(unique_ips), div(n_unique_ips, Threads.threadpoolsize(), RoundUp)) Threads.@spawn begin for i in indexes_part iplookups[i] = _lookup_corrected(unique_ips[i]) diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl index 86c391d573e50..27b35dc9c035b 100644 --- a/stdlib/Profile/test/runtests.jl +++ b/stdlib/Profile/test/runtests.jl @@ -120,11 +120,10 @@ end @testset "setting sample count and delay in init" begin n_, delay_ = Profile.init() n_original = n_ - nthreads = Sys.iswindows() ? 1 : Threads.nthreads() sample_size_bytes = sizeof(Ptr) def_n = Sys.iswindows() && Sys.WORD_SIZE == 32 ? 
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index 86c391d573e50..27b35dc9c035b 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -120,11 +120,10 @@ end
 @testset "setting sample count and delay in init" begin
     n_, delay_ = Profile.init()
     n_original = n_
-    nthreads = Sys.iswindows() ? 1 : Threads.nthreads()
     sample_size_bytes = sizeof(Ptr)
     def_n = Sys.iswindows() && Sys.WORD_SIZE == 32 ? 1_000_000 : 10_000_000
-    if Sys.WORD_SIZE == 32 && (def_n * nthreads * sample_size_bytes) > 2^29
-        @test n_ * nthreads * sample_size_bytes <= 2^29
+    if Sys.WORD_SIZE == 32 && (def_n * sample_size_bytes) > 2^29
+        @test n_ * sample_size_bytes <= 2^29
     else
         @test n_ == def_n
     end
@@ -133,8 +132,8 @@ end
     @test delay_ == def_delay
     Profile.init(n=1_000_001, delay=0.0005)
     n_, delay_ = Profile.init()
-    if Sys.WORD_SIZE == 32 && (1_000_001 * nthreads * sample_size_bytes) > 2^29
-        @test n_ * nthreads * sample_size_bytes <= 2^29
+    if Sys.WORD_SIZE == 32 && (1_000_001 * sample_size_bytes) > 2^29
+        @test n_ * sample_size_bytes <= 2^29
     else
         @test n_ == 1_000_001
     end
diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl
index cc76c6fcfb0c8..7063f1f87bf68 100644
--- a/test/cmdlineargs.jl
+++ b/test/cmdlineargs.jl
@@ -226,7 +226,7 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no`
     @test errors_not_signals(`$exename --cpu-target=invalidtarget`)

     # -t, --threads
-    code = "print(Threads.nthreads())"
+    code = "print(Threads.threadpoolsize())"
     cpu_threads = ccall(:jl_effective_threads, Int32, ())
     @test string(cpu_threads) ==
           read(`$exename --threads auto -e $code`, String) ==
@@ -254,7 +254,7 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no`

     # Combining --threads and --procs: --threads does propagate
     withenv("JULIA_NUM_THREADS" => nothing) do
-        code = "print(sum(remotecall_fetch(Threads.nthreads, x) for x in procs()))"
+        code = "print(sum(remotecall_fetch(Threads.threadpoolsize, x) for x in procs()))"
         @test read(`$exename -p2 -t2 -e $code`, String) == "6"
     end
diff --git a/test/runtests.jl b/test/runtests.jl
index 4c9ac1cfd869c..3227804cf7b47 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -127,7 +127,7 @@ cd(@__DIR__) do
         println("""
             Running parallel tests with:
               nworkers() = $(nworkers())
-              nthreads() = $(Threads.nthreads())
+              nthreads() = $(Threads.threadpoolsize())
               Sys.CPU_THREADS = $(Sys.CPU_THREADS)
               Sys.total_memory() = $(Base.format_bytes(Sys.total_memory()))
               Sys.free_memory() = $(Base.format_bytes(Sys.free_memory()))
diff --git a/test/threads.jl b/test/threads.jl
index 09e802757062b..fb684b275e864 100644
--- a/test/threads.jl
+++ b/test/threads.jl
@@ -124,7 +124,7 @@ end
 function get_nthreads(options = ``; cpus = nothing)
     cmd = `$(Base.julia_cmd()) --startup-file=no $(options)`
-    cmd = `$cmd -e "print(Threads.nthreads())"`
+    cmd = `$cmd -e "print(Threads.threadpoolsize())"`
     cmd = addenv(cmd, "JULIA_EXCLUSIVE" => "0", "JULIA_NUM_THREADS" => "auto")
     if cpus !== nothing
         cmd = setcpuaffinity(cmd, cpus)
diff --git a/test/threads_exec.jl b/test/threads_exec.jl
index 4bce3ebd71b41..68ba9377cf955 100644
--- a/test/threads_exec.jl
+++ b/test/threads_exec.jl
@@ -2,7 +2,7 @@
 using Test
 using Base.Threads
-using Base.Threads: SpinLock
+using Base.Threads: SpinLock, threadpoolsize

 # for cfunction_closure
 include("testenv.jl")
@@ -27,9 +27,12 @@ end
 # (expected test duration is about 18-180 seconds)
 Timer(t -> killjob("KILLING BY THREAD TEST WATCHDOG\n"), 1200)

+@test Threads.threadid() == 1
+@test 1 <= threadpoolsize() <= Threads.maxthreadid()
+
 # basic lock check
-if nthreads() > 1
-    let lk = Base.Threads.SpinLock()
+if threadpoolsize() > 1
+    let lk = SpinLock()
         c1 = Base.Event()
         c2 = Base.Event()
         @test trylock(lk)
@@ -50,7 +53,7 @@ end

 # threading constructs

-let a = zeros(Int, 2 * nthreads())
+let a = zeros(Int, 2 * threadpoolsize())
     @threads for i = 1:length(a)
         @sync begin
             @async begin
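The two new asserts encode the ordering the rest of this test file relies on. Stated directly (a sketch, assuming this patch's API; `maxthreadid` and `threadpoolsize` are imported explicitly here, as the tests above do):

using Base.Threads
using Base.Threads: maxthreadid, threadpoolsize

@assert 1 <= threadid() <= maxthreadid()       # ids never exceed the (growing) thread count
@assert threadpoolsize() <= maxthreadid()      # the default pool is a subset of all threads
@assert nthreads(:default) == threadpoolsize() # same quantity, two spellings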
@@ -70,7 +73,7 @@ end

 # parallel loop with parallel atomic addition
 function threaded_loop(a, r, x)
-    counter = Threads.Atomic{Int}(min(Threads.nthreads(), length(r)))
+    counter = Threads.Atomic{Int}(min(threadpoolsize(), length(r)))
     @threads for i in r
         # synchronize the start given that each partition is started sequentially,
         # meaning that without the wait, if the loop is too fast the iteration can happen in order
@@ -208,7 +211,7 @@ function threaded_gc_locked(::Type{LockT}) where LockT
 end
 threaded_gc_locked(SpinLock)
-threaded_gc_locked(Threads.ReentrantLock)
+threaded_gc_locked(ReentrantLock)

 # Issue 33159
 # Make sure that a Threads.Condition can't be used without being locked, on any thread.
@@ -423,7 +426,7 @@ end
 for T in intersect((Int32, Int64, Float32, Float64), Base.Threads.atomictypes)
     var = Atomic{T}()
     nloops = 1000
-    di = nthreads()
+    di = threadpoolsize()
     @threads for i in 1:di
         test_atomic_cas!(var, i:di:nloops)
     end
@@ -513,7 +516,7 @@ function test_thread_cfunction()
     @test cfs[1] == cf1
     @test cfs[2] == cf(fs[2])
     @test length(unique(cfs)) == 1000
-    ok = zeros(Int, nthreads())
+    ok = zeros(Int, threadpoolsize())
     @threads :static for i in 1:10000
         i = mod1(i, 1000)
         fi = fs[i]
@@ -529,14 +532,14 @@ if cfunction_closure
 end

 function test_thread_range()
-    a = zeros(Int, nthreads())
+    a = zeros(Int, threadpoolsize())
     @threads for i in 1:threadid()
         a[i] = 1
     end
     for i in 1:threadid()
         @test a[i] == 1
     end
-    for i in (threadid() + 1):nthreads()
+    for i in (threadid() + 1):threadpoolsize()
         @test a[i] == 0
     end
 end
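Several tests above size per-thread scratch space with `threadpoolsize()` and index it by `threadid()` under `:static` scheduling. A minimal sketch of that idiom; `tsum` is a hypothetical helper, and it assumes no interactive pool, so worker ids are exactly `1:threadpoolsize()`:

using Base.Threads
using Base.Threads: threadpoolsize

function tsum(xs)
    acc = zeros(eltype(xs), threadpoolsize())  # one slot per default-pool thread
    @threads :static for i in eachindex(xs)
        acc[threadid()] += xs[i]               # :static pins iterations to threads
    end
    return sum(acc)
end

@assert tsum(collect(1:1000)) == 500500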
@@ -576,17 +579,17 @@ test_nested_loops()

 function test_thread_too_few_iters()
     x = Atomic()
-    a = zeros(Int, nthreads()+2)
-    threaded_loop(a, 1:nthreads()-1, x)
-    found = zeros(Bool, nthreads()+2)
-    for i=1:nthreads()-1
+    a = zeros(Int, threadpoolsize()+2)
+    threaded_loop(a, 1:threadpoolsize()-1, x)
+    found = zeros(Bool, threadpoolsize()+2)
+    for i=1:threadpoolsize()-1
         found[a[i]] = true
     end
-    @test x[] == nthreads()-1
+    @test x[] == threadpoolsize()-1
     # Next test checks that all loop iterations ran,
     # and were unique (via pigeon-hole principle).
-    @test !(false in found[1:nthreads()-1])
-    @test !(true in found[nthreads():end])
+    @test !(false in found[1:threadpoolsize()-1])
+    @test !(true in found[threadpoolsize():end])
 end
 test_thread_too_few_iters()
@@ -728,10 +731,10 @@ function _atthreads_with_error(a, err)
     end
     a
 end
-@test_throws CompositeException _atthreads_with_error(zeros(nthreads()), true)
-let a = zeros(nthreads())
+@test_throws CompositeException _atthreads_with_error(zeros(threadpoolsize()), true)
+let a = zeros(threadpoolsize())
     _atthreads_with_error(a, false)
-    @test a == [1:nthreads();]
+    @test a == [1:threadpoolsize();]
 end

 # static schedule
@@ -742,11 +745,11 @@ function _atthreads_static_schedule(n)
     end
     return ids
 end
-@test _atthreads_static_schedule(nthreads()) == 1:nthreads()
+@test _atthreads_static_schedule(threadpoolsize()) == 1:threadpoolsize()
 @test _atthreads_static_schedule(1) == [1;]
 @test_throws(
     "`@threads :static` cannot be used concurrently or nested",
-    @threads(for i = 1:1; _atthreads_static_schedule(nthreads()); end),
+    @threads(for i = 1:1; _atthreads_static_schedule(threadpoolsize()); end),
 )

 # dynamic schedule
@@ -759,35 +762,35 @@ function _atthreads_dynamic_schedule(n)
     end
     return inc[], flags
 end
-@test _atthreads_dynamic_schedule(nthreads()) == (nthreads(), ones(nthreads()))
+@test _atthreads_dynamic_schedule(threadpoolsize()) == (threadpoolsize(), ones(threadpoolsize()))
 @test _atthreads_dynamic_schedule(1) == (1, ones(1))
 @test _atthreads_dynamic_schedule(10) == (10, ones(10))
-@test _atthreads_dynamic_schedule(nthreads() * 2) == (nthreads() * 2, ones(nthreads() * 2))
+@test _atthreads_dynamic_schedule(threadpoolsize() * 2) == (threadpoolsize() * 2, ones(threadpoolsize() * 2))

 # nested dynamic schedule
 function _atthreads_dynamic_dynamic_schedule()
     inc = Threads.Atomic{Int}(0)
-    Threads.@threads :dynamic for _ = 1:nthreads()
-        Threads.@threads :dynamic for _ = 1:nthreads()
+    Threads.@threads :dynamic for _ = 1:threadpoolsize()
+        Threads.@threads :dynamic for _ = 1:threadpoolsize()
             Threads.atomic_add!(inc, 1)
         end
     end
     return inc[]
 end
-@test _atthreads_dynamic_dynamic_schedule() == nthreads() * nthreads()
+@test _atthreads_dynamic_dynamic_schedule() == threadpoolsize() * threadpoolsize()

 function _atthreads_static_dynamic_schedule()
-    ids = zeros(Int, nthreads())
+    ids = zeros(Int, threadpoolsize())
     inc = Threads.Atomic{Int}(0)
-    Threads.@threads :static for i = 1:nthreads()
+    Threads.@threads :static for i = 1:threadpoolsize()
         ids[i] = Threads.threadid()
-        Threads.@threads :dynamic for _ = 1:nthreads()
+        Threads.@threads :dynamic for _ = 1:threadpoolsize()
             Threads.atomic_add!(inc, 1)
         end
     end
     return ids, inc[]
 end
-@test _atthreads_static_dynamic_schedule() == (1:nthreads(), nthreads() * nthreads())
+@test _atthreads_static_dynamic_schedule() == (1:threadpoolsize(), threadpoolsize() * threadpoolsize())

 # errors inside @threads :dynamic
 function _atthreads_dynamic_with_error(a)
@@ -796,7 +799,7 @@ function _atthreads_dynamic_with_error(a)
     end
     a
 end
-@test_throws "user error in the loop body" _atthreads_dynamic_with_error(zeros(nthreads()))
+@test_throws "user error in the loop body" _atthreads_dynamic_with_error(zeros(threadpoolsize()))

 try
     @macroexpand @threads(for i = 1:10, j = 1:10; end)
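The schedule-composition rules these tests pin down, in one place (a sketch; the behavior is as asserted by the tests above): `:dynamic` nests under both `:dynamic` and `:static`, while nesting `:static` throws.

using Base.Threads
using Base.Threads: threadpoolsize

inc = Atomic{Int}(0)
@threads :dynamic for _ in 1:2
    @threads :dynamic for _ in 1:2  # OK: :dynamic may nest
        atomic_add!(inc, 1)
    end
end
@assert inc[] == 4

# By contrast, this throws "`@threads :static` cannot be used concurrently or nested":
#   @threads :dynamic for _ in 1:2
#       @threads :static for _ in 1:2; end
#   end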
@@ -1025,7 +1028,7 @@ function check_sync_end_race()
             nnotscheduled += y === :notscheduled
         end
         # Useful for tuning the test:
-        @debug "`check_sync_end_race` done" nthreads() ncompleted nnotscheduled nerror
+        @debug "`check_sync_end_race` done" threadpoolsize() ncompleted nnotscheduled nerror
     finally
         done[] = true
     end
@@ -1039,21 +1042,21 @@ end

 # issue #41546, thread-safe package loading
 @testset "package loading" begin
-    ch = Channel{Bool}(nthreads())
+    ch = Channel{Bool}(threadpoolsize())
     barrier = Base.Event()
     old_act_proj = Base.ACTIVE_PROJECT[]
     try
         pushfirst!(LOAD_PATH, "@")
         Base.ACTIVE_PROJECT[] = joinpath(@__DIR__, "TestPkg")
         @sync begin
-            for _ in 1:nthreads()
+            for _ in 1:threadpoolsize()
                 Threads.@spawn begin
                     put!(ch, true)
                     wait(barrier)
                     @eval using TestPkg
                 end
             end
-            for _ in 1:nthreads()
+            for _ in 1:threadpoolsize()
                 take!(ch)
             end
             notify(barrier)
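The testset above uses a channel-plus-event rendezvous so that every task hits `using TestPkg` at once. Generalized as a sketch; `run_concurrently` is a hypothetical helper, not part of the diff:

using Base.Threads: @spawn, threadpoolsize

function run_concurrently(f, n::Int = threadpoolsize())
    ready = Channel{Bool}(n)
    barrier = Base.Event()
    @sync begin
        for _ in 1:n
            @spawn begin
                put!(ready, true)   # announce that this task has started
                wait(barrier)       # park until every other task has too
                f()
            end
        end
        foreach(_ -> take!(ready), 1:n)  # all n tasks are now parked
        notify(barrier)                  # release them simultaneously
    end
end

run_concurrently(() -> println("running on thread ", Threads.threadid()))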