diff --git a/base/deprecated.jl b/base/deprecated.jl
index 83ea85cddff7a..ae80e0c082dea 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -327,4 +327,6 @@ function setproperty!(ci::CodeInfo, s::Symbol, v)
     return setfield!(ci, s, convert(fieldtype(CodeInfo, s), v))
 end
 
+@eval Threads nthreads() = threadpoolsize()
+
 # END 1.9 deprecations
diff --git a/base/partr.jl b/base/partr.jl
index a4cfcb60fe520..c5bb6603d53af 100644
--- a/base/partr.jl
+++ b/base/partr.jl
@@ -2,7 +2,7 @@
 
 module Partr
 
-using ..Threads: SpinLock, nthreads, threadid
+using ..Threads: SpinLock, maxthreadid, threadid
 
 # a task minheap
 mutable struct taskheap
diff --git a/base/pcre.jl b/base/pcre.jl
index 963b3ee4726a2..c46f969839f34 100644
--- a/base/pcre.jl
+++ b/base/pcre.jl
@@ -28,7 +28,7 @@ THREAD_MATCH_CONTEXTS::Vector{Ptr{Cvoid}} = [C_NULL]
 PCRE_COMPILE_LOCK = nothing
 
 _tid() = Int(ccall(:jl_threadid, Int16, ())) + 1
-_nth() = Int(unsafe_load(cglobal(:jl_n_threads, Cint)))
+_mth() = Int(Core.Intrinsics.atomic_pointerref(cglobal(:jl_n_threads, Cint), :acquire))
 
 function get_local_match_context()
     tid = _tid()
@@ -40,7 +40,7 @@ function get_local_match_context()
         try
             ctxs = THREAD_MATCH_CONTEXTS
             if length(ctxs) < tid
-                global THREAD_MATCH_CONTEXTS = ctxs = copyto!(fill(C_NULL, _nth()), ctxs)
+                global THREAD_MATCH_CONTEXTS = ctxs = copyto!(fill(C_NULL, length(ctxs) + _mth()), ctxs)
             end
         finally
             unlock(l)
diff --git a/base/task.jl b/base/task.jl
index 5601fea70a112..0756836936306 100644
--- a/base/task.jl
+++ b/base/task.jl
@@ -733,7 +733,7 @@ function workqueue_for(tid::Int)
     @lock l begin
         qs = Workqueues
         if length(qs) < tid
-            nt = Threads.nthreads()
+            nt = Threads.maxthreadid()
             @assert tid <= nt
             global Workqueues = qs = copyto!(typeof(qs)(undef, length(qs) + nt - 1), qs)
         end
@@ -746,7 +746,7 @@ end
 function enq_work(t::Task)
     (t._state === task_state_runnable && t.queue === nothing) || error("schedule: Task not runnable")
 
-    if t.sticky || Threads.nthreads() == 1
+    if t.sticky || Threads.threadpoolsize() == 1
         tid = Threads.threadid(t)
         if tid == 0
             # Issue #41324
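The `pcre.jl` and `base/task.jl` hunks above make the same change: per-thread tables that were sized once from `nthreads()` are now grown on demand, because the process can gain threads after startup (through thread adoption), so the startup thread count no longer bounds `threadid()`. Below is a minimal sketch of that pattern in isolation; `SLOTS`, `SLOTS_LOCK`, and `local_slot` are illustrative names, not Base APIs.

```julia
# Grow-on-demand per-thread slots, mirroring THREAD_MATCH_CONTEXTS and
# Workqueues above. Threads.maxthreadid() is a lower bound that only
# increases, so any tid observed before the call fits in the new table.
const SLOTS_LOCK = ReentrantLock()
SLOTS = fill(C_NULL, 1)  # one Ptr{Cvoid} cell per thread id

function local_slot()
    tid = Threads.threadid()
    if length(SLOTS) < tid
        lock(SLOTS_LOCK) do
            s = SLOTS
            if length(s) < tid  # re-check under the lock
                # over-allocate (length + maxthreadid) so threads adopted
                # while we copy still have room, as the hunks above do
                global SLOTS = copyto!(fill(C_NULL, length(s) + Threads.maxthreadid()), s)
            end
        end
    end
    return SLOTS[tid]
end
```

The table is only ever replaced by a strictly larger copy, never resized in place, so an unsynchronized reader always sees a consistent (if stale) vector.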
diff --git a/base/threadingconstructs.jl b/base/threadingconstructs.jl
index 0852fafe192ec..b27ebfdf657da 100644
--- a/base/threadingconstructs.jl
+++ b/base/threadingconstructs.jl
@@ -11,20 +11,25 @@ ID `1`.
 """
 threadid() = Int(ccall(:jl_threadid, Int16, ())+1)
 
+# lower bound on the largest threadid()
 """
-    Threads.nthreads([:default|:interactive]) -> Int
+    Threads.maxthreadid() -> Int
 
-Get the number of threads (across all thread pools or within the specified
-thread pool) available to Julia. The number of threads across all thread
-pools is the inclusive upper bound on [`threadid()`](@ref).
+Get a lower bound on the number of threads (across all thread pools) available
+to the Julia process, with atomic-acquire semantics. The result is always `>=`
+[`threadid()`](@ref).
+"""
+maxthreadid() = Int(Core.Intrinsics.atomic_pointerref(cglobal(:jl_n_threads, Cint), :acquire))
 
-See also: `BLAS.get_num_threads` and `BLAS.set_num_threads` in the
-[`LinearAlgebra`](@ref man-linalg) standard library, and `nprocs()` in the
-[`Distributed`](@ref man-distributed) standard library.
 """
-function nthreads end
+    Threads.nthreads(:default | :interactive) -> Int
 
-nthreads() = Int(unsafe_load(cglobal(:jl_n_threads, Cint)))
+Get the current number of threads within the specified thread pool. The threads in the
+default pool have ID numbers `1:nthreads(:default)`.
+
+See also `BLAS.get_num_threads` and `BLAS.set_num_threads` in the [`LinearAlgebra`](@ref
+man-linalg) standard library, `nprocs()` in the [`Distributed`](@ref man-distributed)
+standard library, and [`Threads.maxthreadid()`](@ref).
+"""
 function nthreads(pool::Symbol)
     if pool === :default
         tpid = Int8(0)
@@ -35,6 +40,7 @@ end
     end
     return _nthreads_in_pool(tpid)
 end
+
 function _nthreads_in_pool(tpid::Int8)
     p = unsafe_load(cglobal(:jl_n_threads_per_pool, Ptr{Cint}))
     return Int(unsafe_load(p, tpid + 1))
@@ -57,10 +63,21 @@
 Returns the number of threadpools currently configured.
 """
 nthreadpools() = Int(unsafe_load(cglobal(:jl_n_threadpools, Cint)))
 
+"""
+    Threads.threadpoolsize()
+
+Get the number of threads available to the Julia default worker thread pool.
+
+See also: `BLAS.get_num_threads` and `BLAS.set_num_threads` in the
+[`LinearAlgebra`](@ref man-linalg) standard library, and `nprocs()` in the
+[`Distributed`](@ref man-distributed) standard library.
+"""
+threadpoolsize() = Threads._nthreads_in_pool(Int8(0))
+#threadpoolsize() = max(Int(JLOptions().nthreads), 1)
 
 function threading_run(fun, static)
     ccall(:jl_enter_threaded_region, Cvoid, ())
-    n = nthreads()
+    n = threadpoolsize()
     tasks = Vector{Task}(undef, n)
     for i = 1:n
         t = Task(() -> fun(i)) # pass in tid
@@ -93,7 +110,7 @@ function _threadsfor(iter, lbody, schedule)
             tid = 1
             len, rem = lenr, 0
         else
-            len, rem = divrem(lenr, nthreads())
+            len, rem = divrem(lenr, threadpoolsize())
         end
         # not enough iterations for all the threads?
         if len == 0
@@ -185,7 +202,7 @@ assumption may be removed in the future.
 This scheduling option is merely a hint to the underlying execution mechanism. However, a
 few properties can be expected. The number of `Task`s used by `:dynamic` scheduler is
 bounded by a small constant multiple of the number of available worker threads
-([`nthreads()`](@ref Threads.nthreads)). Each task processes contiguous regions of the
+([`Threads.threadpoolsize()`](@ref)). Each task processes contiguous regions of the
 iteration space. Thus, `@threads :dynamic for x in xs; f(x); end` is typically more
 efficient than `@sync for x in xs; @spawn f(x); end` if `length(xs)` is significantly
 larger than the number of the worker threads and the run-time of `f(x)` is relatively
@@ -222,7 +239,7 @@ julia> function busywait(seconds)
 
 julia> @time begin
            Threads.@spawn busywait(5)
-           Threads.@threads :static for i in 1:Threads.nthreads()
+           Threads.@threads :static for i in 1:Threads.threadpoolsize()
               busywait(1)
           end
       end
@@ -230,7 +247,7 @@ julia> @time begin
 
 julia> @time begin
            Threads.@spawn busywait(5)
-           Threads.@threads :dynamic for i in 1:Threads.nthreads()
+           Threads.@threads :dynamic for i in 1:Threads.threadpoolsize()
              busywait(1)
          end
       end
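With the pool split above, the single `Threads.nthreads()` query is replaced by three more precise ones: `nthreads(pool)` for one pool, `threadpoolsize()` for the default worker pool, and `maxthreadid()` for the only value that bounds `threadid()`. A rough sketch of what each would return, assuming a session started with four default-pool threads and one interactive thread (e.g. via the pool-aware `-t 4,1` startup flag):

```julia
Threads.nthreads(:default)      # 4: threads in the default pool
Threads.nthreads(:interactive)  # 1: threads in the interactive pool
Threads.threadpoolsize()        # 4: what worker-sizing code should use
Threads.maxthreadid()           # >= 5: may keep growing as threads are adopted
Threads.nthreads()              # 4: deprecated alias for threadpoolsize()
                                # (see the base/deprecated.jl hunk above)
```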
-""" -function resize_nthreads!(A::AbstractVector, copyvalue=A[1]) - nthr = nthreads() - nold = length(A) - resize!(A, nthr) - for i = nold+1:nthr - A[i] = deepcopy(copyvalue) - end - return A -end - end diff --git a/base/threads_overloads.jl b/base/threads_overloads.jl index 376c1af94f441..7241d3182901d 100644 --- a/base/threads_overloads.jl +++ b/base/threads_overloads.jl @@ -3,7 +3,7 @@ """ Threads.foreach(f, channel::Channel; schedule::Threads.AbstractSchedule=Threads.FairSchedule(), - ntasks=Threads.nthreads()) + ntasks=Base.threadpoolsize()) Similar to `foreach(f, channel)`, but iteration over `channel` and calls to `f` are split across `ntasks` tasks spawned by `Threads.@spawn`. This function @@ -40,7 +40,7 @@ collect(d) = [1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256 """ function Threads.foreach(f, channel::Channel; schedule::Threads.AbstractSchedule=Threads.FairSchedule(), - ntasks=Threads.nthreads()) + ntasks=Threads.threadpoolsize()) apply = _apply_for_schedule(schedule) stop = Threads.Atomic{Bool}(false) @sync for _ in 1:ntasks diff --git a/cli/loader_exe.c b/cli/loader_exe.c index 07a0bddcd4b87..ee7d27a563e04 100644 --- a/cli/loader_exe.c +++ b/cli/loader_exe.c @@ -15,7 +15,7 @@ extern "C" { JULIA_DEFINE_FAST_TLS #ifdef _COMPILER_ASAN_ENABLED_ -JL_DLLEXPORT const char* __asan_default_options() +JL_DLLEXPORT const char* __asan_default_options(void) { return "allow_user_segv_handler=1:detect_leaks=0"; // FIXME: enable LSAN after fixing leaks & defining __lsan_default_suppressions(), diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl index acd61be502465..264036f4fb3ae 100644 --- a/contrib/generate_precompile.jl +++ b/contrib/generate_precompile.jl @@ -1,7 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -if Threads.nthreads() != 1 - @warn "Running this file with multiple Julia threads may lead to a build error" Threads.nthreads() +if Threads.maxthreadid() != 1 + @warn "Running this file with multiple Julia threads may lead to a build error" Base.maxthreadid() end if Base.isempty(Base.ARGS) || Base.ARGS[1] !== "0" @@ -340,7 +340,7 @@ function generate_precompile_statements() # wait for the next prompt-like to appear readuntil(output_copy, "\n") strbuf = "" - while true + while !eof(output_copy) strbuf *= String(readavailable(output_copy)) occursin(JULIA_PROMPT, strbuf) && break occursin(PKG_PROMPT, strbuf) && break diff --git a/doc/src/base/multi-threading.md b/doc/src/base/multi-threading.md index 293857c1c6c65..4932aef4cc938 100644 --- a/doc/src/base/multi-threading.md +++ b/doc/src/base/multi-threading.md @@ -5,9 +5,11 @@ Base.Threads.@threads Base.Threads.foreach Base.Threads.@spawn Base.Threads.threadid +Base.Threads.maxthreadid Base.Threads.nthreads Base.Threads.threadpool Base.Threads.nthreadpools +Base.Threads.threadpoolsize ``` See also [Multi-Threading](@ref man-multithreading). 
diff --git a/doc/src/manual/embedding.md b/doc/src/manual/embedding.md
index 26904d9ccffcd..0430d8a7c1ffb 100644
--- a/doc/src/manual/embedding.md
+++ b/doc/src/manual/embedding.md
@@ -604,7 +604,7 @@ The second condition above implies that you can not safely call `jl_...()` funct
 void *func(void*)
 {
     // Wrong, jl_eval_string() called from thread that was not started by Julia
-    jl_eval_string("println(Threads.nthreads())");
+    jl_eval_string("println(Threads.threadid())");
     return NULL;
 }
 
@@ -630,7 +630,7 @@ void *func(void*)
     // Okay, all jl_...() calls from the same thread,
     // even though it is not the main application thread
     jl_init();
-    jl_eval_string("println(Threads.nthreads())");
+    jl_eval_string("println(Threads.threadid())");
     jl_atexit_hook(0);
     return NULL;
 }
@@ -670,7 +670,7 @@ int main()
     jl_eval_string("func(i) = ccall(:c_func, Float64, (Int32,), i)");
 
     // Call func() multiple times, using multiple threads to do so
-    jl_eval_string("println(Threads.nthreads())");
+    jl_eval_string("println(Threads.threadpoolsize())");
     jl_eval_string("use(i) = println(\"[J $(Threads.threadid())] i = $(i) -> $(func(i))\")");
     jl_eval_string("Threads.@threads for i in 1:5 use(i) end");
 
diff --git a/doc/src/manual/multi-threading.md b/doc/src/manual/multi-threading.md
index b20d0e54f1087..a536d4a508c09 100644
--- a/doc/src/manual/multi-threading.md
+++ b/doc/src/manual/multi-threading.md
@@ -6,10 +6,10 @@ of Julia multi-threading features.
 ## Starting Julia with multiple threads
 
 By default, Julia starts up with a single thread of execution. This can be verified by using the
-command [`Threads.nthreads()`](@ref):
+command [`Threads.threadpoolsize()`](@ref):
 
 ```jldoctest
-julia> Threads.nthreads()
+julia> Threads.threadpoolsize()
 1
 ```
 
@@ -38,7 +38,7 @@ $ julia --threads 4
 
 Let's verify there are 4 threads at our disposal.
```julia-repl -julia> Threads.nthreads() +julia> Threads.threadpoolsize() 4 ``` @@ -267,7 +267,7 @@ avoid the race: ```julia-repl julia> using Base.Threads -julia> nthreads() +julia> Threads.threadpoolsize() 4 julia> acc = Ref(0) diff --git a/src/ccall.cpp b/src/ccall.cpp index 88c80b333b027..274a50a8fac76 100644 --- a/src/ccall.cpp +++ b/src/ccall.cpp @@ -1540,7 +1540,8 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) assert(lrt == getVoidTy(ctx.builder.getContext())); assert(!isVa && !llvmcall && nccallargs == 0); JL_GC_POP(); - emit_gc_safepoint(ctx); + ctx.builder.CreateCall(prepare_call(gcroot_flush_func)); + emit_gc_safepoint(ctx.builder, get_current_ptls(ctx), ctx.tbaa().tbaa_const); return ghostValue(ctx, jl_nothing_type); } else if (is_libjulia_func("jl_get_ptls_states")) { @@ -1643,7 +1644,8 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) ctx.builder.SetInsertPoint(checkBB); ctx.builder.CreateLoad( getSizeTy(ctx.builder.getContext()), - ctx.builder.CreateConstInBoundsGEP1_32(getSizeTy(ctx.builder.getContext()), get_current_signal_page(ctx), -1), + ctx.builder.CreateConstInBoundsGEP1_32(getSizeTy(ctx.builder.getContext()), + get_current_signal_page_from_ptls(ctx.builder, get_current_ptls(ctx), ctx.tbaa().tbaa_const), -1), true); ctx.builder.CreateBr(contBB); ctx.f->getBasicBlockList().push_back(contBB); diff --git a/src/cgutils.cpp b/src/cgutils.cpp index 2f1803ef91051..7a1da252e1ca0 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -8,7 +8,6 @@ STATISTIC(EmittedPointerFromObjref, "Number of emitted pointer_from_objref calls"); STATISTIC(EmittedPointerBitcast, "Number of emitted pointer bitcasts"); -STATISTIC(EmittedNthPtrAddr, "Number of emitted nth pointer address instructions"); STATISTIC(EmittedTypeof, "Number of emitted typeof instructions"); STATISTIC(EmittedErrors, "Number of emitted errors"); STATISTIC(EmittedConditionalErrors, "Number of emitted conditional errors"); @@ -42,7 +41,6 @@ STATISTIC(EmittedCPointerChecks, "Number of C pointer checks emitted"); STATISTIC(EmittedAllocObjs, "Number of object allocations emitted"); STATISTIC(EmittedWriteBarriers, "Number of write barriers emitted"); STATISTIC(EmittedNewStructs, "Number of new structs emitted"); -STATISTIC(EmittedSignalFences, "Number of signal fences emitted"); STATISTIC(EmittedDeferSignal, "Number of deferred signals emitted"); static Value *track_pjlvalue(jl_codectx_t &ctx, Value *V) @@ -971,41 +969,20 @@ static void emit_memcpy(jl_codectx_t &ctx, Value *dst, MDNode *tbaa_dst, const j emit_memcpy_llvm(ctx, dst, tbaa_dst, data_pointer(ctx, src), src.tbaa, sz, align, is_volatile); } -static Value *emit_nthptr_addr(jl_codectx_t &ctx, Value *v, ssize_t n, bool gctracked = true) -{ - ++EmittedNthPtrAddr; - return ctx.builder.CreateInBoundsGEP( - ctx.types().T_prjlvalue, - emit_bitcast(ctx, maybe_decay_tracked(ctx, v), ctx.types().T_pprjlvalue), - ConstantInt::get(getSizeTy(ctx.builder.getContext()), n)); -} - -static Value *emit_nthptr_addr(jl_codectx_t &ctx, Value *v, Value *idx) +static LoadInst *emit_nthptr_recast(jl_codectx_t &ctx, Value *v, Value *idx, MDNode *tbaa, Type *type) { - ++EmittedNthPtrAddr; - return ctx.builder.CreateInBoundsGEP( + // p = (jl_value_t**)v; *(type*)&p[n] + Value *vptr = ctx.builder.CreateInBoundsGEP( ctx.types().T_prjlvalue, emit_bitcast(ctx, maybe_decay_tracked(ctx, v), ctx.types().T_pprjlvalue), idx); + LoadInst *load = ctx.builder.CreateLoad(type, emit_bitcast(ctx, vptr, PointerType::get(type, 0))); + 
tbaa_decorate(tbaa, load); + return load; } -static LoadInst *emit_nthptr_recast(jl_codectx_t &ctx, Value *v, Value *idx, MDNode *tbaa, Type *type) -{ - // p = (jl_value_t**)v; *(type*)&p[n] - Value *vptr = emit_nthptr_addr(ctx, v, idx); - return cast(tbaa_decorate(tbaa, ctx.builder.CreateLoad(type, - emit_bitcast(ctx, vptr, PointerType::get(type, 0))))); -} - -static LoadInst *emit_nthptr_recast(jl_codectx_t &ctx, Value *v, ssize_t n, MDNode *tbaa, Type *type) -{ - // p = (jl_value_t**)v; *(type*)&p[n] - Value *vptr = emit_nthptr_addr(ctx, v, n); - return cast(tbaa_decorate(tbaa, ctx.builder.CreateLoad(type, - emit_bitcast(ctx, vptr, PointerType::get(type, 0))))); - } - static Value *boxed(jl_codectx_t &ctx, const jl_cgval_t &v, bool is_promotable=false); + static Value *emit_typeof(jl_codectx_t &ctx, Value *v, bool maybenull); static jl_cgval_t emit_typeof(jl_codectx_t &ctx, const jl_cgval_t &p, bool maybenull) @@ -1177,8 +1154,12 @@ static Value *emit_datatype_isprimitivetype(jl_codectx_t &ctx, Value *dt) static Value *emit_datatype_name(jl_codectx_t &ctx, Value *dt) { - Value *vptr = emit_nthptr_addr(ctx, dt, (ssize_t)(offsetof(jl_datatype_t, name) / sizeof(char*))); - return tbaa_decorate(ctx.tbaa().tbaa_const, ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, vptr, Align(sizeof(void*)))); + unsigned n = offsetof(jl_datatype_t, name) / sizeof(char*); + Value *vptr = ctx.builder.CreateInBoundsGEP( + ctx.types().T_pjlvalue, + emit_bitcast(ctx, maybe_decay_tracked(ctx, dt), ctx.types().T_ppjlvalue), + ConstantInt::get(getSizeTy(ctx.builder.getContext()), n)); + return tbaa_decorate(ctx.tbaa().tbaa_const, ctx.builder.CreateAlignedLoad(ctx.types().T_pjlvalue, vptr, Align(sizeof(void*)))); } // --- generating various error checks --- @@ -1490,8 +1471,8 @@ static std::pair emit_isa(jl_codectx_t &ctx, const jl_cgval_t &x, // so the isa test reduces to a comparison of the typename by pointer return std::make_pair( ctx.builder.CreateICmpEQ( - mark_callee_rooted(ctx, emit_datatype_name(ctx, emit_typeof_boxed(ctx, x))), - mark_callee_rooted(ctx, literal_pointer_val(ctx, (jl_value_t*)dt->name))), + emit_datatype_name(ctx, emit_typeof_boxed(ctx, x)), + literal_pointer_val(ctx, (jl_value_t*)dt->name)), false); } if (jl_is_uniontype(intersected_type) && @@ -3427,10 +3408,10 @@ static void emit_cpointercheck(jl_codectx_t &ctx, const jl_cgval_t &x, const std emit_typecheck(ctx, mark_julia_type(ctx, t, true, jl_any_type), (jl_value_t*)jl_datatype_type, msg); Value *istype = - ctx.builder.CreateICmpEQ(mark_callee_rooted(ctx, emit_datatype_name(ctx, t)), - mark_callee_rooted(ctx, literal_pointer_val(ctx, (jl_value_t*)jl_pointer_typename))); - BasicBlock *failBB = BasicBlock::Create(ctx.builder.getContext(),"fail",ctx.f); - BasicBlock *passBB = BasicBlock::Create(ctx.builder.getContext(),"pass"); + ctx.builder.CreateICmpEQ(emit_datatype_name(ctx, t), + literal_pointer_val(ctx, (jl_value_t*)jl_pointer_typename)); + BasicBlock *failBB = BasicBlock::Create(ctx.builder.getContext(), "fail", ctx.f); + BasicBlock *passBB = BasicBlock::Create(ctx.builder.getContext(), "pass"); ctx.builder.CreateCondBr(istype, passBB, failBB); ctx.builder.SetInsertPoint(failBB); @@ -3878,8 +3859,7 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg static void emit_signal_fence(jl_codectx_t &ctx) { - ++EmittedSignalFences; - ctx.builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SyncScope::SingleThread); + emit_signal_fence(ctx.builder); } static Value *emit_defer_signal(jl_codectx_t 
&ctx) @@ -3892,69 +3872,6 @@ static Value *emit_defer_signal(jl_codectx_t &ctx) return ctx.builder.CreateInBoundsGEP(ctx.types().T_sigatomic, ptls, ArrayRef(offset), "jl_defer_signal"); } -static void emit_gc_safepoint(jl_codectx_t &ctx) -{ - ctx.builder.CreateCall(prepare_call(gcroot_flush_func)); - emit_signal_fence(ctx); - ctx.builder.CreateLoad(getSizeTy(ctx.builder.getContext()), get_current_signal_page(ctx), true); - emit_signal_fence(ctx); -} - -static Value *emit_gc_state_set(jl_codectx_t &ctx, Value *state, Value *old_state) -{ - Type *T_int8 = state->getType(); - Value *ptls = emit_bitcast(ctx, get_current_ptls(ctx), getInt8PtrTy(ctx.builder.getContext())); - Constant *offset = ConstantInt::getSigned(getInt32Ty(ctx.builder.getContext()), offsetof(jl_tls_states_t, gc_state)); - Value *gc_state = ctx.builder.CreateInBoundsGEP(T_int8, ptls, ArrayRef(offset), "gc_state"); - if (old_state == nullptr) { - old_state = ctx.builder.CreateLoad(T_int8, gc_state); - cast(old_state)->setOrdering(AtomicOrdering::Monotonic); - } - ctx.builder.CreateAlignedStore(state, gc_state, Align(sizeof(void*)))->setOrdering(AtomicOrdering::Release); - if (auto *C = dyn_cast(old_state)) - if (C->isZero()) - return old_state; - if (auto *C = dyn_cast(state)) - if (!C->isZero()) - return old_state; - BasicBlock *passBB = BasicBlock::Create(ctx.builder.getContext(), "safepoint", ctx.f); - BasicBlock *exitBB = BasicBlock::Create(ctx.builder.getContext(), "after_safepoint", ctx.f); - Constant *zero8 = ConstantInt::get(T_int8, 0); - ctx.builder.CreateCondBr(ctx.builder.CreateAnd(ctx.builder.CreateICmpNE(old_state, zero8), // if (old_state && !state) - ctx.builder.CreateICmpEQ(state, zero8)), - passBB, exitBB); - ctx.builder.SetInsertPoint(passBB); - emit_gc_safepoint(ctx); - ctx.builder.CreateBr(exitBB); - ctx.builder.SetInsertPoint(exitBB); - return old_state; -} - -static Value *emit_gc_unsafe_enter(jl_codectx_t &ctx) -{ - Value *state = ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0); - return emit_gc_state_set(ctx, state, nullptr); -} - -static Value *emit_gc_unsafe_leave(jl_codectx_t &ctx, Value *state) -{ - Value *old_state = ConstantInt::get(state->getType(), 0); - return emit_gc_state_set(ctx, state, old_state); -} - -//static Value *emit_gc_safe_enter(jl_codectx_t &ctx) -//{ -// Value *state = ConstantInt::get(getInt8Ty(ctx.builder.getContext()), JL_GC_STATE_SAFE); -// return emit_gc_state_set(ctx, state, nullptr); -//} -// -//static Value *emit_gc_safe_leave(jl_codectx_t &ctx, Value *state) -//{ -// Value *old_state = ConstantInt::get(state->getType(), JL_GC_STATE_SAFE); -// return emit_gc_state_set(ctx, state, old_state); -//} - - #ifndef JL_NDEBUG static int compare_cgparams(const jl_cgparams_t *a, const jl_cgparams_t *b) diff --git a/src/codegen.cpp b/src/codegen.cpp index 2f3974c3a5110..6f654e59d81b4 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -590,6 +590,11 @@ static const auto jlpgcstack_func = new JuliaFunction{ nullptr, }; +static const auto jladoptthread_func = new JuliaFunction{ + "julia.get_pgcstack_or_new", + jlpgcstack_func->_type, + jlpgcstack_func->_attrs, +}; // important functions @@ -1482,11 +1487,9 @@ static Value *global_binding_pointer(jl_codectx_t &ctx, jl_module_t *m, jl_sym_t static jl_cgval_t emit_checked_var(jl_codectx_t &ctx, Value *bp, jl_sym_t *name, bool isvol, MDNode *tbaa); static jl_cgval_t emit_sparam(jl_codectx_t &ctx, size_t i); static Value *emit_condition(jl_codectx_t &ctx, const jl_cgval_t &condV, const std::string &msg); -static void 
allocate_gc_frame(jl_codectx_t &ctx, BasicBlock *b0); static Value *get_current_task(jl_codectx_t &ctx); static Value *get_current_ptls(jl_codectx_t &ctx); static Value *get_last_age_field(jl_codectx_t &ctx); -static Value *get_current_signal_page(jl_codectx_t &ctx); static void CreateTrap(IRBuilder<> &irbuilder, bool create_new_block = true); static CallInst *emit_jlcall(jl_codectx_t &ctx, Function *theFptr, Value *theF, const jl_cgval_t *args, size_t nargs, JuliaFunction *trampoline); @@ -5327,21 +5330,17 @@ JL_GCC_IGNORE_STOP // --- generate function bodies --- // gc frame emission -static void allocate_gc_frame(jl_codectx_t &ctx, BasicBlock *b0) +static void allocate_gc_frame(jl_codectx_t &ctx, BasicBlock *b0, bool or_new=false) { // allocate a placeholder gc instruction // this will require the runtime, but it gets deleted later if unused - ctx.topalloca = ctx.builder.CreateCall(prepare_call(jlpgcstack_func)); + ctx.topalloca = ctx.builder.CreateCall(prepare_call(or_new ? jladoptthread_func : jlpgcstack_func)); ctx.pgcstack = ctx.topalloca; } static Value *get_current_task(jl_codectx_t &ctx) { - const int ptls_offset = offsetof(jl_task_t, gcstack); - return ctx.builder.CreateInBoundsGEP( - ctx.types().T_pjlvalue, emit_bitcast(ctx, ctx.pgcstack, ctx.types().T_ppjlvalue), - ConstantInt::get(getSizeTy(ctx.builder.getContext()), -(ptls_offset / sizeof(void *))), - "current_task"); + return get_current_task_from_pgcstack(ctx.builder, ctx.pgcstack); } // Get PTLS through current task. @@ -5361,15 +5360,6 @@ static Value *get_last_age_field(jl_codectx_t &ctx) "world_age"); } -// Get signal page through current task. -static Value *get_current_signal_page(jl_codectx_t &ctx) -{ - // return ctx.builder.CreateCall(prepare_call(reuse_signal_page_func)); - Value *ptls = get_current_ptls(ctx); - int nthfield = offsetof(jl_tls_states_t, safepoint) / sizeof(void *); - return emit_nthptr_recast(ctx, ptls, nthfield, ctx.tbaa().tbaa_const, getSizePtrTy(ctx.builder.getContext())); -} - static Function *emit_tojlinvoke(jl_code_instance_t *codeinst, Module *M, jl_codegen_params_t ¶ms) { ++EmittedToJLInvokes; @@ -5654,19 +5644,11 @@ static Function* gen_cfun_wrapper( ctx.builder.SetInsertPoint(b0); DebugLoc noDbg; ctx.builder.SetCurrentDebugLocation(noDbg); - allocate_gc_frame(ctx, b0); + allocate_gc_frame(ctx, b0, true); - Value *dummy_world = ctx.builder.CreateAlloca(getSizeTy(ctx.builder.getContext())); - Value *have_tls = ctx.builder.CreateIsNotNull(ctx.pgcstack); - // TODO: in the future, initialize a full TLS context here Value *world_age_field = get_last_age_field(ctx); - world_age_field = ctx.builder.CreateSelect(have_tls, world_age_field, dummy_world); Value *last_age = tbaa_decorate(ctx.tbaa().tbaa_gcframe, ctx.builder.CreateAlignedLoad(getSizeTy(ctx.builder.getContext()), world_age_field, Align(sizeof(size_t)))); - Value *last_gc_state = ConstantInt::get(getInt8Ty(ctx.builder.getContext()), JL_GC_STATE_SAFE); - last_gc_state = emit_guarded_test(ctx, have_tls, last_gc_state, [&] { - return emit_gc_unsafe_enter(ctx); - }); Value *world_v = ctx.builder.CreateAlignedLoad(getSizeTy(ctx.builder.getContext()), prepare_global_in(jl_Module, jlgetworld_global), Align(sizeof(size_t))); @@ -5681,12 +5663,7 @@ static Function* gen_cfun_wrapper( emit_bitcast(ctx, literal_pointer_val(ctx, (jl_value_t*)codeinst), getSizePtrTy(ctx.builder.getContext())), offsetof(jl_code_instance_t, max_world) / sizeof(size_t)), Align(sizeof(size_t))); - // XXX: age is always OK if we don't have a TLS. 
This is a hack required due to `@threadcall` abuse. - // and adds quite a bit of complexity here, even though it's still wrong - // (anything that tries to interact with the runtime will fault) age_ok = ctx.builder.CreateICmpUGE(lam_max, world_v); - world_v = ctx.builder.CreateSelect(ctx.builder.CreateOr(have_tls, age_ok), world_v, lam_max); - age_ok = ctx.builder.CreateOr(ctx.builder.CreateNot(have_tls), age_ok); } ctx.builder.CreateStore(world_v, world_age_field); @@ -6043,12 +6020,6 @@ static Function* gen_cfun_wrapper( } ctx.builder.CreateStore(last_age, world_age_field); - if (!sig.retboxed) { - emit_guarded_test(ctx, have_tls, nullptr, [&] { - emit_gc_unsafe_leave(ctx, last_gc_state); - return nullptr; - }); - } ctx.builder.CreateRet(r); ctx.builder.SetCurrentDebugLocation(noDbg); @@ -8490,6 +8461,7 @@ static void init_jit_functions(void) add_named_global(jl_write_barrier_func, (void*)NULL); add_named_global(jl_write_barrier_binding_func, (void*)NULL); add_named_global(jldlsym_func, &jl_load_and_lookup); + add_named_global("jl_adopt_thread", &jl_adopt_thread); add_named_global(jlgetcfunctiontrampoline_func, &jl_get_cfunction_trampoline); add_named_global(jlgetnthfieldchecked_func, &jl_get_nth_field_checked); add_named_global(diff_gc_total_bytes_func, &jl_gc_diff_total_bytes); diff --git a/src/codegen_shared.h b/src/codegen_shared.h index 0e68668378f4e..329cc567e8c5f 100644 --- a/src/codegen_shared.h +++ b/src/codegen_shared.h @@ -22,6 +22,7 @@ enum AddressSpace { }; static inline auto getSizeTy(llvm::LLVMContext &ctxt) { + //return M.getDataLayout().getIntPtrType(M.getContext()); if (sizeof(size_t) > sizeof(uint32_t)) { return llvm::Type::getInt64Ty(ctxt); } else { @@ -176,26 +177,127 @@ static inline llvm::Value *emit_bitcast_with_builder(llvm::IRBuilder<> &builder, } } +// Get PTLS through current task. +static inline llvm::Value *get_current_task_from_pgcstack(llvm::IRBuilder<> &builder, llvm::Value *pgcstack) +{ + using namespace llvm; + auto T_ppjlvalue = JuliaType::get_ppjlvalue_ty(builder.getContext()); + auto T_pjlvalue = JuliaType::get_pjlvalue_ty(builder.getContext()); + const int pgcstack_offset = offsetof(jl_task_t, gcstack); + return builder.CreateInBoundsGEP( + T_pjlvalue, emit_bitcast_with_builder(builder, pgcstack, T_ppjlvalue), + ConstantInt::get(getSizeTy(builder.getContext()), -(pgcstack_offset / sizeof(void *))), + "current_task"); +} + // Get PTLS through current task. static inline llvm::Value *get_current_ptls_from_task(llvm::IRBuilder<> &builder, llvm::Value *current_task, llvm::MDNode *tbaa) { using namespace llvm; auto T_ppjlvalue = JuliaType::get_ppjlvalue_ty(builder.getContext()); auto T_pjlvalue = JuliaType::get_pjlvalue_ty(builder.getContext()); - auto T_size = builder.GetInsertBlock()->getModule()->getDataLayout().getIntPtrType(builder.getContext()); + auto T_size = getSizeTy(builder.getContext()); const int ptls_offset = offsetof(jl_task_t, ptls); llvm::Value *pptls = builder.CreateInBoundsGEP( - T_pjlvalue, current_task, - ConstantInt::get(T_size, ptls_offset / sizeof(void *)), - "ptls_field"); + T_pjlvalue, current_task, + ConstantInt::get(T_size, ptls_offset / sizeof(void *)), + "ptls_field"); LoadInst *ptls_load = builder.CreateAlignedLoad(T_pjlvalue, - emit_bitcast_with_builder(builder, pptls, T_ppjlvalue), Align(sizeof(void *)), "ptls_load"); + emit_bitcast_with_builder(builder, pptls, T_ppjlvalue), Align(sizeof(void *)), "ptls_load"); // Note: Corresponding store (`t->ptls = ptls`) happens in `ctx_switch` of tasks.c. 
tbaa_decorate(tbaa, ptls_load); - // Using `CastInst::Create` to get an `Instruction*` without explicit cast: - auto ptls = CastInst::Create(Instruction::BitCast, ptls_load, T_ppjlvalue, "ptls"); - builder.Insert(ptls); - return ptls; + return builder.CreateBitCast(ptls_load, T_ppjlvalue, "ptls"); +} + +// Get signal page through current task. +static inline llvm::Value *get_current_signal_page_from_ptls(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa) +{ + using namespace llvm; + // return builder.CreateCall(prepare_call(reuse_signal_page_func)); + auto T_size = getSizeTy(builder.getContext()); + auto T_psize = T_size->getPointerTo(); + auto T_ppsize = T_psize->getPointerTo(); + int nthfield = offsetof(jl_tls_states_t, safepoint) / sizeof(void *); + ptls = emit_bitcast_with_builder(builder, ptls, T_ppsize); + llvm::Value *psafepoint = builder.CreateInBoundsGEP( + T_psize, ptls, ConstantInt::get(T_size, nthfield)); + LoadInst *ptls_load = builder.CreateAlignedLoad( + T_psize, psafepoint, Align(sizeof(void *)), "safepoint"); + tbaa_decorate(tbaa, ptls_load); + return ptls_load; +} + +static inline void emit_signal_fence(llvm::IRBuilder<> &builder) +{ + using namespace llvm; + builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SyncScope::SingleThread); +} + +static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa) +{ + emit_signal_fence(builder); + builder.CreateLoad(getSizeTy(builder.getContext()), get_current_signal_page_from_ptls(builder, ptls, tbaa), true); + emit_signal_fence(builder); +} + +static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, llvm::Value *old_state) +{ + using namespace llvm; + Type *T_int8 = state->getType(); + ptls = emit_bitcast_with_builder(builder, ptls, builder.getInt8PtrTy()); + Constant *offset = ConstantInt::getSigned(builder.getInt32Ty(), offsetof(jl_tls_states_t, gc_state)); + Value *gc_state = builder.CreateInBoundsGEP(T_int8, ptls, ArrayRef(offset), "gc_state"); + if (old_state == nullptr) { + old_state = builder.CreateLoad(T_int8, gc_state); + cast(old_state)->setOrdering(AtomicOrdering::Monotonic); + } + builder.CreateAlignedStore(state, gc_state, Align(sizeof(void*)))->setOrdering(AtomicOrdering::Release); + if (auto *C = dyn_cast(old_state)) + if (C->isZero()) + return old_state; + if (auto *C = dyn_cast(state)) + if (!C->isZero()) + return old_state; + BasicBlock *passBB = BasicBlock::Create(builder.getContext(), "safepoint", builder.GetInsertBlock()->getParent()); + BasicBlock *exitBB = BasicBlock::Create(builder.getContext(), "after_safepoint", builder.GetInsertBlock()->getParent()); + Constant *zero8 = ConstantInt::get(T_int8, 0); + builder.CreateCondBr(builder.CreateAnd(builder.CreateICmpNE(old_state, zero8), // if (old_state && !state) + builder.CreateICmpEQ(state, zero8)), + passBB, exitBB); + builder.SetInsertPoint(passBB); + MDNode *tbaa = get_tbaa_const(builder.getContext()); + emit_gc_safepoint(builder, ptls, tbaa); + builder.CreateBr(exitBB); + builder.SetInsertPoint(exitBB); + return old_state; +} + +static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls) +{ + using namespace llvm; + Value *state = builder.getInt8(0); + return emit_gc_state_set(builder, ptls, state, nullptr); +} + +static inline llvm::Value *emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state) +{ + using namespace llvm; + Value *old_state = builder.getInt8(0); + return 
emit_gc_state_set(builder, ptls, state, old_state); +} + +static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls) +{ + using namespace llvm; + Value *state = builder.getInt8(JL_GC_STATE_SAFE); + return emit_gc_state_set(builder, ptls, state, nullptr); +} + +static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state) +{ + using namespace llvm; + Value *old_state = builder.getInt8(JL_GC_STATE_SAFE); + return emit_gc_state_set(builder, ptls, state, old_state); } // Compatibility shims for LLVM attribute APIs that were renamed in LLVM 14. @@ -327,5 +429,4 @@ inline Attribute getAttributeAtIndex(const AttributeList &L, unsigned Index, Att return L.getAttribute(Index, Kind); #endif } - } diff --git a/src/flisp/flmain.c b/src/flisp/flmain.c index f3861eed9e8a2..68e9d87a26d77 100644 --- a/src/flisp/flmain.c +++ b/src/flisp/flmain.c @@ -10,7 +10,7 @@ extern "C" { #if defined(__has_feature) #if __has_feature(address_sanitizer) -const char* __asan_default_options() { +const char* __asan_default_options(void) { return "detect_leaks=0"; } #endif diff --git a/src/gc-alloc-profiler.cpp b/src/gc-alloc-profiler.cpp index 818d6e803c9df..1bcbeb2189f5f 100644 --- a/src/gc-alloc-profiler.cpp +++ b/src/gc-alloc-profiler.cpp @@ -80,7 +80,8 @@ extern "C" { // Needed since these functions doesn't take any arguments. JL_DLLEXPORT void jl_start_alloc_profile(double sample_rate) { // We only need to do this once, the first time this is called. - while (g_alloc_profile.per_thread_profiles.size() < (size_t)jl_n_threads) { + size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + while (g_alloc_profile.per_thread_profiles.size() < nthreads) { g_alloc_profile.per_thread_profiles.push_back(jl_per_thread_alloc_profile_t{}); } @@ -131,7 +132,10 @@ JL_DLLEXPORT void jl_free_alloc_profile() { void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *type) JL_NOTSAFEPOINT { auto& global_profile = g_alloc_profile; - auto thread_id = jl_atomic_load_relaxed(&jl_current_task->tid); + size_t thread_id = jl_atomic_load_relaxed(&jl_current_task->tid); + if (thread_id >= global_profile.per_thread_profiles.size()) + return; // ignore allocations on threads started after the alloc-profile started + auto& profile = global_profile.per_thread_profiles[thread_id]; auto sample_val = double(rand()) / double(RAND_MAX); diff --git a/src/gc-debug.c b/src/gc-debug.c index 7d6ca8ece2ecf..dae634bd9b8fd 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -99,7 +99,7 @@ static arraylist_t bits_save[4]; static void gc_clear_mark_page(jl_gc_pagemeta_t *pg, int bits) { - jl_ptls_t ptls2 = jl_all_tls_states[pg->thread_n]; + jl_ptls_t ptls2 = gc_all_tls_states[pg->thread_n]; jl_gc_pool_t *pool = &ptls2->heap.norm_pools[pg->pool_n]; jl_taggedvalue_t *pv = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); char *lim = (char*)pv + GC_PAGE_SZ - GC_PAGE_OFFSET - pool->osize; @@ -164,8 +164,8 @@ static void clear_mark(int bits) } } bigval_t *v; - for (int i = 0;i < jl_n_threads;i++) { - v = jl_all_tls_states[i]->heap.big_objects; + for (int i = 0; i < gc_n_threads; i++) { + v = gc_all_tls_states[i]->heap.big_objects; while (v != NULL) { void *gcv = &v->header; if (!gc_verifying) @@ -207,8 +207,8 @@ static void gc_verify_track(jl_ptls_t ptls) clear_mark(GC_CLEAN); gc_mark_queue_all_roots(ptls, &sp); gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 
0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); @@ -256,8 +256,8 @@ void gc_verify(jl_ptls_t ptls) gc_verifying = 1; gc_mark_queue_all_roots(ptls, &sp); gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); @@ -297,7 +297,7 @@ static void gc_verify_tags_page(jl_gc_pagemeta_t *pg) // for all pages in use int p_n = pg->pool_n; int t_n = pg->thread_n; - jl_ptls_t ptls2 = jl_all_tls_states[t_n]; + jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; char *data = pg->data; @@ -401,8 +401,8 @@ static void gc_verify_tags_pagetable(void) void gc_verify_tags(void) { // verify the freelist chains look valid - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; for (int i = 0; i < JL_GC_N_POOLS; i++) { // for all pools, iterate its freelist jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; @@ -467,7 +467,7 @@ static void gc_debug_alloc_init(jl_alloc_num_t *num, const char *name) return; if (*env == 'r') { env++; - for (int i = 0;i < 3;i++) { + for (int i = 0; i < 3; i++) { while (num->random[i] == 0) { num->random[i] = jl_rand(); } @@ -577,7 +577,7 @@ static void gc_scrub_task(jl_task_t *ta) jl_ptls_t ptls = jl_current_task->ptls; jl_ptls_t ptls2 = NULL; if (tid != -1) - ptls2 = jl_all_tls_states[tid]; + ptls2 = gc_all_tls_states[tid]; char *low; char *high; @@ -946,8 +946,8 @@ void gc_time_mark_pause(int64_t t0, int64_t scanned_bytes, { int64_t last_remset_len = 0; int64_t remset_nptr = 0; - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; last_remset_len += ptls2->heap.last_remset->len; remset_nptr = ptls2->heap.remset_nptr; } @@ -1023,7 +1023,7 @@ void jl_gc_debug_init(void) #endif #ifdef OBJPROFILE - for (int g = 0;g < 3;g++) { + for (int g = 0; g < 3; g++) { htable_new(&obj_counts[g], 0); htable_new(&obj_sizes[g], 0); } @@ -1085,8 +1085,8 @@ void gc_stats_all_pool(void) { size_t nb=0, w, tw=0, no=0, tp=0, nold=0, noldbytes=0, np, nol; for (int i = 0; i < JL_GC_N_POOLS; i++) { - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; size_t b = pool_stats(&ptls2->heap.norm_pools[i], &w, &np, &nol); nb += b; no += (b / ptls2->heap.norm_pools[i].osize); @@ -1110,8 +1110,8 @@ void gc_stats_all_pool(void) void gc_stats_big_obj(void) { size_t nused=0, nbytes=0, nused_old=0, nbytes_old=0; - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; bigval_t *v = ptls2->heap.big_objects; while (v != NULL) { if (gc_marked(v->bits.gc)) { @@ -1219,7 +1219,7 @@ void gc_count_pool(void) empty_pages = 0; gc_count_pool_pagetable(); jl_safe_printf("****** Pool stat: ******\n"); - for (int i = 0;i < 4;i++) + 
for (int i = 0; i < 4; i++) jl_safe_printf("bits(%d): %" PRId64 "\n", i, poolobj_sizes[i]); // empty_pages is inaccurate after the sweep since young objects are // also GC_CLEAN diff --git a/src/gc-stacks.c b/src/gc-stacks.c index b7adf254026ca..32f5058c4e3ce 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -188,8 +188,9 @@ void sweep_stack_pools(void) // bufsz = t->bufsz // if (stkbuf) // push(free_stacks[sz], stkbuf) - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { diff --git a/src/gc.c b/src/gc.c index c45ff8206ca67..9984b07435111 100644 --- a/src/gc.c +++ b/src/gc.c @@ -168,6 +168,8 @@ static _Atomic(int) support_conservative_marking = 0; jl_gc_num_t gc_num = {0}; static size_t last_long_collect_interval; +int gc_n_threads; +jl_ptls_t* gc_all_tls_states; pagetable_t memory_map; @@ -190,12 +192,15 @@ NOINLINE uintptr_t gc_get_stack_ptr(void) #define should_timeout() 0 -static void jl_gc_wait_for_the_world(void) +void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) { - if (jl_n_threads > 1) + assert(gc_n_threads); + if (gc_n_threads > 1) jl_wake_libuv(); - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; // This acquire load pairs with the release stores // in the signal handler of safepoint so we are sure that // all the stores on those threads are visible. @@ -207,6 +212,9 @@ static void jl_gc_wait_for_the_world(void) } } + +void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads); + // malloc wrappers, aligned allocation #if defined(_OS_WINDOWS_) @@ -269,8 +277,8 @@ static void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT static void run_finalizer(jl_task_t *ct, jl_value_t *o, jl_value_t *ff) { - if (gc_ptr_tag(o, 1)) { - ((void (*)(void*))ff)(gc_ptr_clear_tag(o, 1)); + if (gc_ptr_tag(o, 3)) { + ((void (*)(void*))ff)(gc_ptr_clear_tag(o, 3)); return; } jl_value_t *args[2] = {ff,o}; @@ -411,7 +419,10 @@ static void run_finalizers(jl_task_t *ct) jl_rng_split(ct->rngState, finalizer_rngState); // This releases the finalizers lock. 
+ int8_t was_in_finalizer = ct->ptls->in_finalizer; + ct->ptls->in_finalizer = 1; jl_gc_run_finalizers_in_list(ct, &copied_list); + ct->ptls->in_finalizer = was_in_finalizer; arraylist_free(&copied_list); memcpy(&ct->rngState[0], &save_rngState[0], sizeof(save_rngState)); @@ -423,9 +434,7 @@ JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0) { - ptls->in_finalizer = 1; run_finalizers(ct); - ptls->in_finalizer = 0; } } @@ -496,13 +505,18 @@ static void schedule_all_finalizers(arraylist_t *flist) JL_NOTSAFEPOINT void jl_gc_run_all_finalizers(jl_task_t *ct) { + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); schedule_all_finalizers(&finalizer_list_marked); - // This could be run before we had a chance to setup all threads - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2) schedule_all_finalizers(&ptls2->finalizers); } + gc_n_threads = 0; + gc_all_tls_states = NULL; run_finalizers(ct); } @@ -539,6 +553,13 @@ JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 1), f); } +// schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) +JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT +{ + assert(!gc_ptr_tag(v, 3)); + jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 3), f); +} + JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT { if (__unlikely(jl_typeis(f, jl_voidpointer_type))) { @@ -559,11 +580,18 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) arraylist_new(&copied_list, 0); // No need to check the to_finalize list since the user is apparently // still holding a reference to the object - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; - finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2) + finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); } finalize_object(&finalizer_list_marked, o, &copied_list, 0); + gc_n_threads = 0; + gc_all_tls_states = NULL; if (copied_list.len > 0) { // This releases the finalizers lock. jl_gc_run_finalizers_in_list(ct, &copied_list); @@ -595,9 +623,11 @@ static void gc_sweep_foreign_objs_in_list(arraylist_t *objs) static void gc_sweep_foreign_objs(void) { - for (int i = 0;i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; - gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2) + gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); } } @@ -608,18 +638,19 @@ static int64_t last_gc_total_bytes = 0; // under this limit, but we will go above it rather than halting. 
#ifdef _P64 typedef uint64_t memsize_t; -#define default_collect_interval (5600*1024*sizeof(void*)) -static size_t max_collect_interval = 1250000000UL; -// Eventually we can expose this to the user/ci. -memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; +static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); +static const size_t max_collect_interval = 1250000000UL; +static size_t total_mem; +// We expose this to the user/ci as jl_gc_set_max_memory +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; #else typedef uint32_t memsize_t; -#define default_collect_interval (3200*1024*sizeof(void*)) -static size_t max_collect_interval = 500000000UL; +static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); +static const size_t max_collect_interval = 500000000UL; // Work really hard to stay within 2GB // Alternative is to risk running out of address space // on 32 bit architectures. -memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; #endif // global variables for GC stats @@ -725,9 +756,11 @@ static void gc_sync_cache(jl_ptls_t ptls) JL_NOTSAFEPOINT // No other threads can be running marking at the same time static void gc_sync_all_caches_nolock(jl_ptls_t ptls) { - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; - gc_sync_cache_nolock(ptls, &ptls2->gc_cache); + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2) + gc_sync_cache_nolock(ptls, &ptls2->gc_cache); } } @@ -934,8 +967,11 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, static void clear_weak_refs(void) { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; size_t n, l = ptls2->heap.weak_refs.len; void **lst = ptls2->heap.weak_refs.items; for (n = 0; n < l; n++) { @@ -948,8 +984,11 @@ static void clear_weak_refs(void) static void sweep_weak_refs(void) { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; size_t n = 0; size_t ndel = 0; size_t l = ptls2->heap.weak_refs.len; @@ -1066,11 +1105,16 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT { gc_time_big_start(); - for (int i = 0;i < jl_n_threads;i++) - sweep_big_list(sweep_full, &jl_all_tls_states[i]->heap.big_objects); + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; + sweep_big_list(sweep_full, &ptls2->heap.big_objects); + } if (sweep_full) { bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); - // Move all survivors from big_objects_marked list to big_objects list. + // Move all survivors from big_objects_marked list to the big_objects list of this thread. 
if (ptls->heap.big_objects) ptls->heap.big_objects->prev = last_next; *last_next = ptls->heap.big_objects; @@ -1109,8 +1153,12 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls = jl_all_tls_states[i]; + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval); dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed); @@ -1125,8 +1173,12 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls = jl_all_tls_states[i]; + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); @@ -1173,8 +1225,11 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT { gc_time_mallocd_array_start(); - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; mallocarray_t *ma = ptls2->heap.mallocarrays; mallocarray_t **pma = &ptls2->heap.mallocarrays; while (ma != NULL) { @@ -1198,11 +1253,10 @@ static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT } // pool allocation -static inline jl_taggedvalue_t *reset_page(const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT +static inline jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT { assert(GC_PAGE_OFFSET >= sizeof(void*)); pg->nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / p->osize; - jl_ptls_t ptls2 = jl_all_tls_states[pg->thread_n]; pg->pool_n = p - ptls2->heap.norm_pools; memset(pg->ages, 0, GC_PAGE_SZ / 8 / p->osize + 1); jl_taggedvalue_t *beg = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); @@ -1240,7 +1294,7 @@ static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT pg->osize = p->osize; pg->ages = (uint8_t*)malloc_s(GC_PAGE_SZ / 8 / p->osize + 1); pg->thread_n = ptls->tid; - jl_taggedvalue_t *fl = reset_page(p, pg, NULL); + jl_taggedvalue_t *fl = reset_page(ptls, p, pg, NULL); p->newpages = fl; return fl; } @@ -1352,7 +1406,8 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t // FIXME - need to do accounting on a per-thread basis // on quick sweeps, keep a few pages empty but allocated for performance if (!sweep_full && lazy_freed_pages <= default_collect_interval / GC_PAGE_SZ) { - jl_taggedvalue_t *begin = reset_page(p, pg, p->newpages); + jl_ptls_t ptls2 = gc_all_tls_states[pg->thread_n]; + jl_taggedvalue_t *begin = reset_page(ptls2, p, pg, p->newpages); p->newpages = begin; begin->next = (jl_taggedvalue_t*)0; lazy_freed_pages++; @@ -1454,7 +1509,7 @@ static inline void 
sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg { int p_n = pg->pool_n; int t_n = pg->thread_n; - jl_ptls_t ptls2 = jl_all_tls_states[t_n]; + jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; pfl[t_n * JL_GC_N_POOLS + p_n] = sweep_page(p, pg, pfl[t_n * JL_GC_N_POOLS + p_n], sweep_full, osize); @@ -1566,9 +1621,9 @@ static void gc_sweep_pool(int sweep_full) gc_time_pool_start(); lazy_freed_pages = 0; - // For the benfit of the analyzer, which doesn't know that jl_n_threads + // For the benefit of the analyzer, which doesn't know that gc_n_threads // doesn't change over the course of this function - size_t n_threads = jl_n_threads; + size_t n_threads = gc_n_threads; // allocate enough space to hold the end of the free list chain // for every thread and pool size @@ -1577,7 +1632,13 @@ static void gc_sweep_pool(int sweep_full) // update metadata of pages that were pointed to by freelist or newpages from a pool // i.e. pages being the current allocation target for (int t_i = 0; t_i < n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) { + for (int i = 0; i < JL_GC_N_POOLS; i++) { + pfl[t_i * JL_GC_N_POOLS + i] = NULL; + } + continue; + } for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; @@ -1606,6 +1667,9 @@ static void gc_sweep_pool(int sweep_full) // null out terminal pointers of free lists for (int t_i = 0; t_i < n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; for (int i = 0; i < JL_GC_N_POOLS; i++) { *pfl[t_i * JL_GC_N_POOLS + i] = NULL; } @@ -2360,10 +2424,13 @@ stack: { } else { new_obj = (jl_value_t*)gc_read_stack(&rts[i], offset, lb, ub); - if (gc_ptr_tag(new_obj, 1)) { + if (gc_ptr_tag(new_obj, 3)) { // handle tagged pointers in finalizer list new_obj = gc_ptr_clear_tag(new_obj, 1); + // skip over the finalizer fptr i++; + if (gc_ptr_tag(new_obj, 2)) + continue; } } if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) @@ -2537,6 +2604,8 @@ finlist: { new_obj = *begin; if (__unlikely(!new_obj)) continue; + if (gc_ptr_tag(new_obj, 2)) + continue; if (gc_ptr_tag(new_obj, 1)) { new_obj = (jl_value_t*)gc_ptr_clear_tag(new_obj, 1); begin++; @@ -2716,7 +2785,7 @@ mark: { int16_t tid = jl_atomic_load_relaxed(&ta->tid); gc_invoke_callbacks(jl_gc_cb_task_scanner_t, gc_cblist_task_scanner, - (ta, tid != -1 && ta == jl_all_tls_states[tid]->root_task)); + (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); import_gc_state(ptls, &sp); } #ifdef COPY_STACKS @@ -2733,7 +2802,7 @@ mark: { if (stkbuf && ta->copy_stack && ta->ptls == NULL) { int16_t tid = jl_atomic_load_relaxed(&ta->tid); assert(tid >= 0); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = gc_all_tls_states[tid]; ub = (uintptr_t)ptls2->stackbase; lb = ub - ta->copy_stack; offset = (uintptr_t)stkbuf - lb; @@ -2843,12 +2912,19 @@ mark: { static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) { - gc_mark_queue_obj(gc_cache, sp, jl_atomic_load_relaxed(&ptls2->current_task)); - gc_mark_queue_obj(gc_cache, sp, ptls2->root_task); - if (ptls2->next_task) - gc_mark_queue_obj(gc_cache, sp, ptls2->next_task); - if (ptls2->previous_task) // shouldn't be necessary, but no reason not to - gc_mark_queue_obj(gc_cache, sp, ptls2->previous_task); + jl_task_t *task; + task = ptls2->root_task; + if (task) + 
gc_mark_queue_obj(gc_cache, sp, task); + task = jl_atomic_load_relaxed(&ptls2->current_task); + if (task) + gc_mark_queue_obj(gc_cache, sp, task); + task = ptls2->next_task; + if (task) + gc_mark_queue_obj(gc_cache, sp, task); + task = ptls2->previous_task; + if (task) // shouldn't be necessary, but no reason not to + gc_mark_queue_obj(gc_cache, sp, task); if (ptls2->previous_exception) gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); } @@ -2897,17 +2973,25 @@ static void sweep_finalizer_list(arraylist_t *list) size_t j = 0; for (size_t i=0; i < len; i+=2) { void *v0 = items[i]; - void *v = gc_ptr_clear_tag(v0, 1); + void *v = gc_ptr_clear_tag(v0, 3); if (__unlikely(!v0)) { // remove from this list continue; } void *fin = items[i+1]; - int isfreed = !gc_marked(jl_astaggedvalue(v)->bits.gc); - int isold = (list != &finalizer_list_marked && + int isfreed; + int isold; + if (gc_ptr_tag(v, 2)) { + isfreed = 1; + isold = 0; + } + else { + isfreed = !gc_marked(jl_astaggedvalue(v)->bits.gc); + isold = (list != &finalizer_list_marked && jl_astaggedvalue(v)->bits.gc == GC_OLD_MARKED && jl_astaggedvalue(fin)->bits.gc == GC_OLD_MARKED); + } if (isfreed || isold) { // remove from this list } @@ -3090,11 +3174,18 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) uint64_t start_mark_time = jl_hrtime(); // 1. fix GC bits of objects in the remset. - for (int t_i = 0; t_i < jl_n_threads; t_i++) - jl_gc_premark(jl_all_tls_states[t_i]); + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 != NULL) + jl_gc_premark(ptls2); + } - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; // 2.1. mark every object in the `last_remsets` and `rem_binding` jl_gc_queue_remset(gc_cache, &sp, ptls2); // 2.2. mark every thread local root @@ -3130,16 +3221,22 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // mark the object moved to the marked list from the // `finalizer_list` by `sweep_finalizer_list` size_t orig_marked_len = finalizer_list_marked.len; - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; sweep_finalizer_list(&ptls2->finalizers); } if (prev_sweep_full) { sweep_finalizer_list(&finalizer_list_marked); orig_marked_len = 0; } - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, orig_marked_len); @@ -3178,8 +3275,13 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // 5. 
next collection decision int not_freed_enough = (collection == JL_GC_AUTO) && estimate_freed < (7*(actual_allocd/10)); int nptr = 0; - for (int i = 0;i < jl_n_threads;i++) - nptr += jl_all_tls_states[i]->heap.remset_nptr; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; + nptr += ptls2->heap.remset_nptr; + } // many pointers in the intergen frontier => "quick" mark is not quick int large_frontier = nptr*sizeof(void*) >= default_collect_interval; @@ -3194,9 +3296,16 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (large_frontier) { sweep_full = 1; } - if (gc_num.interval > max_collect_interval) { + size_t maxmem = 0; +#ifdef _P64 + // on a big memory machine, increase max_collect_interval to totalmem / nthreads / 2 + maxmem = total_mem / gc_n_threads / 2; +#endif + if (maxmem < max_collect_interval) + maxmem = max_collect_interval; + if (gc_num.interval > maxmem) { sweep_full = 1; - gc_num.interval = max_collect_interval; + gc_num.interval = maxmem; } } @@ -3244,8 +3353,11 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // sweeping is over // 6. if it is a quick sweep, put back the remembered objects in queued state // so that we don't trigger the barrier again on them. - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; if (!sweep_full) { for (int i = 0; i < ptls2->heap.remset->len; i++) { jl_astaggedvalue(ptls2->heap.remset->items[i])->bits.gc = GC_MARKED; @@ -3357,9 +3469,17 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) #endif // Now we are ready to wait for other threads to hit the safepoint, // we can do a few things that doesn't require synchronization. - // TODO (concurrently queue objects) - // no-op for non-threading - jl_gc_wait_for_the_world(); + // + // We must sync here with the tls_lock operations, so that we have a + // seq-cst order between these events now we know that either the new + // thread must run into our safepoint flag or we must observe the + // existence of the thread in the jl_n_threads count. + // + // TODO: concurrently queue objects + jl_fence(); + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads); JL_PROBE_GC_STOP_THE_WORLD(); uint64_t t1 = jl_hrtime(); @@ -3382,7 +3502,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) JL_UNLOCK_NOGC(&finalizers_lock); } - // no-op for non-threading + gc_n_threads = 0; + gc_all_tls_states = NULL; jl_safepoint_end_gc(); jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); JL_PROBE_GC_END(); @@ -3391,10 +3512,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) // Doing this on all threads is racy (it's impossible to check // or wait for finalizers on other threads without dead lock). 
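The pattern above recurs throughout the collector: at stop-the-world time `jl_gc_collect` executes `jl_fence()`, loads `jl_n_threads` with acquire ordering into `gc_n_threads` (and `jl_all_tls_states` into `gc_all_tls_states` alongside it), and resets both just before `jl_safepoint_end_gc()`. Every per-thread loop between those two points must therefore tolerate NULL slots, which are left behind by adopted threads that have since exited. A minimal sketch of the idiom, assuming the `gc_n_threads`/`gc_all_tls_states` globals declared in src/gc.h (the helper name `for_each_live_thread` is hypothetical):

    static void for_each_live_thread(void (*visit)(jl_ptls_t))
    {
        // Only valid while the world is stopped: gc_n_threads is nonzero
        // between the snapshot in jl_gc_collect and the reset at the end.
        assert(gc_n_threads);
        for (int t_i = 0; t_i < gc_n_threads; t_i++) {
            jl_ptls_t ptls2 = gc_all_tls_states[t_i];
            if (ptls2 == NULL)
                continue; // slot belongs to a thread that already exited
            visit(ptls2);
        }
    }

The finalizer gate that follows is unchanged, apart from the `in_finalizer` bookkeeping moving into `run_finalizers` itself: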
if (!ptls->finalizers_inhibited && ptls->locks.len == 0) { - int8_t was_in_finalizer = ptls->in_finalizer; - ptls->in_finalizer = 1; run_finalizers(ct); - ptls->in_finalizer = was_in_finalizer; } JL_PROBE_GC_FINALIZER(); @@ -3409,8 +3527,12 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - for (size_t i = 0; i < jl_n_threads; i++) - jl_gc_queue_thread_local(gc_cache, sp, jl_all_tls_states[i]); + assert(gc_n_threads); + for (size_t i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2) + jl_gc_queue_thread_local(gc_cache, sp, ptls2); + } mark_roots(gc_cache, sp); } @@ -3424,8 +3546,6 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { - if (ptls->tid == 0) - ptls->disable_gc = 1; jl_thread_heap_t *heap = &ptls->heap; jl_gc_pool_t *p = heap->norm_pools; for (int i = 0; i < JL_GC_N_POOLS; i++) { @@ -3480,14 +3600,10 @@ void jl_gc_init(void) gc_num.max_memory = 0; #ifdef _P64 - // on a big memory machine, set max_collect_interval to totalmem / nthreads / 2 - uint64_t total_mem = uv_get_total_memory(); + total_mem = uv_get_total_memory(); uint64_t constrained_mem = uv_get_constrained_memory(); if (constrained_mem > 0 && constrained_mem < total_mem) total_mem = constrained_mem; - size_t maxmem = total_mem / jl_n_threads / 2; - if (maxmem > max_collect_interval) - max_collect_interval = maxmem; #endif // We allocate with abandon until we get close to the free memory on the machine. @@ -3942,7 +4058,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) goto valid_object; } jl_gc_pool_t *pool = - jl_all_tls_states[meta->thread_n]->heap.norm_pools + + gc_all_tls_states[meta->thread_n]->heap.norm_pools + meta->pool_n; if (meta->fl_begin_offset == (uint16_t) -1) { // case 2: this is a page on the newpages list diff --git a/src/gc.h b/src/gc.h index 00c3d48b52935..29c904b796e45 100644 --- a/src/gc.h +++ b/src/gc.h @@ -393,6 +393,8 @@ extern bigval_t *big_objects_marked; extern arraylist_t finalizer_list_marked; extern arraylist_t to_finalize; extern int64_t lazy_freed_pages; +extern int gc_n_threads; +extern jl_ptls_t* gc_all_tls_states; STATIC_INLINE bigval_t *bigval_header(jl_taggedvalue_t *o) JL_NOTSAFEPOINT { diff --git a/src/gf.c b/src/gf.c index 1d36589a082f5..60a267873a321 100644 --- a/src/gf.c +++ b/src/gf.c @@ -2021,8 +2021,7 @@ static void record_precompile_statement(jl_method_instance_t *mi) if (!jl_is_method(def)) return; - if (jl_n_threads > 1) - JL_LOCK(&precomp_statement_out_lock); + JL_LOCK(&precomp_statement_out_lock); if (s_precompile == NULL) { const char *t = jl_options.trace_compile; if (!strncmp(t, "stderr", 6)) { @@ -2041,8 +2040,7 @@ static void record_precompile_statement(jl_method_instance_t *mi) if (s_precompile != JL_STDERR) ios_flush(&f_precompile); } - if (jl_n_threads > 1) - JL_UNLOCK(&precomp_statement_out_lock); + JL_UNLOCK(&precomp_statement_out_lock); } jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t world) diff --git a/src/init.c b/src/init.c index a5632fc66a45a..d5e3c60698af3 100644 --- a/src/init.c +++ b/src/init.c @@ -211,11 +211,9 @@ static void jl_close_item_atexit(uv_handle_t *handle) JL_DLLEXPORT void jl_atexit_hook(int exitcode) { - if (jl_all_tls_states == NULL) + if (jl_atomic_load_relaxed(&jl_all_tls_states) == NULL) return; - jl_task_t *ct = 
jl_current_task; - if (exitcode == 0) jl_write_compiler_output(); jl_print_gc_stats(JL_STDERR); @@ -223,7 +221,14 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) jl_write_coverage_data(jl_options.output_code_coverage); if (jl_options.malloc_log) jl_write_malloc_log(); + int8_t old_state; + jl_task_t *ct = NULL; if (jl_base_module) { + ct = jl_get_current_task(); + if (ct == NULL) { + ct = container_of(jl_adopt_thread(), jl_task_t, gcstack); + } + old_state = jl_gc_unsafe_enter(ct->ptls); jl_value_t *f = jl_get_global(jl_base_module, jl_symbol("_atexit")); if (f != NULL) { JL_TRY { @@ -246,7 +251,8 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) JL_STDOUT = (uv_stream_t*) STDOUT_FILENO; JL_STDERR = (uv_stream_t*) STDERR_FILENO; - jl_gc_run_all_finalizers(ct); + if (ct != NULL) + jl_gc_run_all_finalizers(ct); uv_loop_t *loop = jl_global_event_loop(); if (loop != NULL) { @@ -295,11 +301,13 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) #endif jl_teardown_codegen(); + if (ct != NULL) + jl_gc_unsafe_leave(ct->ptls, old_state); } JL_DLLEXPORT void jl_postoutput_hook(void) { - if (jl_all_tls_states == NULL) + if (jl_atomic_load_relaxed(&jl_all_tls_states) == NULL) return; if (jl_base_module) { @@ -777,7 +785,7 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ if (jl_base_module == NULL) { // nthreads > 1 requires code in Base - jl_n_threads = 1; + jl_atomic_store_relaxed(&jl_n_threads, 1); } jl_start_threads(); diff --git a/src/jl_exported_data.inc b/src/jl_exported_data.inc index eae13a4ff285e..e426f2c1f8b42 100644 --- a/src/jl_exported_data.inc +++ b/src/jl_exported_data.inc @@ -130,7 +130,7 @@ // Data symbols that are defined inside the public libjulia #define JL_EXPORTED_DATA_SYMBOLS(XX) \ XX(jl_n_threadpools, int) \ - XX(jl_n_threads, int) \ + XX(jl_n_threads, _Atomic(int)) \ XX(jl_options, jl_options_t) \ // end of file diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 1bd324325c57b..186dfc4a98ca7 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -3,6 +3,7 @@ #define JL_RUNTIME_EXPORTED_FUNCS(XX) \ XX(jl_active_task_stack) \ XX(jl_add_standard_imports) \ + XX(jl_adopt_thread) \ XX(jl_alignment) \ XX(jl_alloc_array_1d) \ XX(jl_alloc_array_2d) \ @@ -150,6 +151,7 @@ XX(jl_gc_add_finalizer) \ XX(jl_gc_add_finalizer_th) \ XX(jl_gc_add_ptr_finalizer) \ + XX(jl_gc_add_quiescent) \ XX(jl_gc_allocobj) \ XX(jl_gc_alloc_0w) \ XX(jl_gc_alloc_1w) \ diff --git a/src/jlapi.c b/src/jlapi.c index d1fb1e5aacf25..53a6c9b3c6859 100644 --- a/src/jlapi.c +++ b/src/jlapi.c @@ -96,9 +96,15 @@ JL_DLLEXPORT void jl_init_with_image__threading(const char *julia_bindir, jl_init_with_image(julia_bindir, image_relative_path); } +static void _jl_exception_clear(jl_task_t *ct) JL_NOTSAFEPOINT +{ + ct->ptls->previous_exception = NULL; +} + JL_DLLEXPORT jl_value_t *jl_eval_string(const char *str) { jl_value_t *r; + jl_task_t *ct = jl_current_task; JL_TRY { const char filename[] = "none"; jl_value_t *ast = jl_parse_all(str, strlen(str), @@ -106,10 +112,10 @@ JL_DLLEXPORT jl_value_t *jl_eval_string(const char *str) JL_GC_PUSH1(&ast); r = jl_toplevel_eval_in(jl_main_module, ast); JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { - jl_current_task->ptls->previous_exception = jl_current_exception(); + ct->ptls->previous_exception = jl_current_exception(); r = NULL; } return r; @@ -128,7 +134,7 @@ JL_DLLEXPORT jl_value_t *jl_exception_occurred(void) JL_DLLEXPORT void jl_exception_clear(void) { - 
jl_current_task->ptls->previous_exception = NULL; + _jl_exception_clear(jl_current_task); } // get the name of a type as a string @@ -181,7 +187,7 @@ JL_DLLEXPORT jl_value_t *jl_call(jl_function_t *f, jl_value_t **args, uint32_t n v = jl_apply(argv, nargs); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -201,7 +207,7 @@ JL_DLLEXPORT jl_value_t *jl_call0(jl_function_t *f) v = jl_apply_generic(f, NULL, 0); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -224,7 +230,7 @@ JL_DLLEXPORT jl_value_t *jl_call1(jl_function_t *f, jl_value_t *a) v = jl_apply(argv, 2); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -248,7 +254,7 @@ JL_DLLEXPORT jl_value_t *jl_call2(jl_function_t *f, jl_value_t *a, jl_value_t *b v = jl_apply(argv, 3); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -261,6 +267,7 @@ JL_DLLEXPORT jl_value_t *jl_call3(jl_function_t *f, jl_value_t *a, jl_value_t *b, jl_value_t *c) { jl_value_t *v; + jl_task_t *ct = jl_current_task; JL_TRY { jl_value_t **argv; JL_GC_PUSHARGS(argv, 4); @@ -268,16 +275,15 @@ JL_DLLEXPORT jl_value_t *jl_call3(jl_function_t *f, jl_value_t *a, argv[1] = a; argv[2] = b; argv[3] = c; - jl_task_t *ct = jl_current_task; size_t last_age = ct->world_age; ct->world_age = jl_get_world_counter(); v = jl_apply(argv, 4); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { - jl_current_task->ptls->previous_exception = jl_current_exception(); + ct->ptls->previous_exception = jl_current_exception(); v = NULL; } return v; @@ -560,8 +566,8 @@ static NOINLINE int true_main(int argc, char *argv[]) (jl_function_t*)jl_get_global(jl_base_module, jl_symbol("_start")) : NULL; if (start_client) { + jl_task_t *ct = jl_current_task; JL_TRY { - jl_task_t *ct = jl_current_task; size_t last_age = ct->world_age; ct->world_age = jl_get_world_counter(); jl_apply(&start_client, 1); diff --git a/src/julia.h b/src/julia.h index 19a94d8f2f15e..c06bb8e7fd5a0 100644 --- a/src/julia.h +++ b/src/julia.h @@ -895,6 +895,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t); JL_DLLEXPORT void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_finalize(jl_value_t *o); JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value); JL_DLLEXPORT jl_value_t *jl_gc_alloc_0w(void); @@ -1658,7 +1659,7 @@ JL_DLLEXPORT jl_sym_t *jl_get_UNAME(void) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_sym_t *jl_get_ARCH(void) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_get_libllvm(void) JL_NOTSAFEPOINT; extern JL_DLLIMPORT int jl_n_threadpools; -extern JL_DLLIMPORT int jl_n_threads; +extern JL_DLLIMPORT _Atomic(int) jl_n_threads; extern JL_DLLIMPORT int *jl_n_threads_per_pool; // environment entries @@ -1740,6 +1741,7 @@ JL_DLLEXPORT void jl_atexit_hook(int status); JL_DLLEXPORT void jl_postoutput_hook(void); JL_DLLEXPORT void JL_NORETURN jl_exit(int status); JL_DLLEXPORT const char 
*jl_pathname_for_handle(void *handle); +JL_DLLEXPORT jl_gcframe_t **jl_adopt_thread(void); JL_DLLEXPORT int jl_deserialize_verify_header(ios_t *s); JL_DLLEXPORT void jl_preload_sysimg_so(const char *fname); diff --git a/src/julia_gcext.h b/src/julia_gcext.h index 6787dafb4b7ee..6523198474771 100644 --- a/src/julia_gcext.h +++ b/src/julia_gcext.h @@ -76,10 +76,10 @@ JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, // Sweep functions will not automatically be called for objects of // foreign types, as that may not always be desired. Only calling // jl_gc_schedule_foreign_sweepfunc() on an object of a foreign type -// will result in the custome sweep function actually being called. +// will result in the custom sweep function actually being called. // This must be done at most once per object and should usually be // done right after allocating the object. -JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t * bj); +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *bj); // The following functions enable support for conservative marking. This // functionality allows the user to determine if a machine word can be diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 1fab161a4766e..00072da00fd7d 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2684,12 +2684,12 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(std::vector<int> &Colors, State // Insert GC frame stores PlaceGCFrameStores(S, AllocaSlot - 2, Colors, gcframe); // Insert GCFrame pops - for(Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - if (isa<ReturnInst>(I->getTerminator())) { + for (auto &BB : *F) { + if (isa<ReturnInst>(BB.getTerminator())) { auto popGcframe = CallInst::Create( getOrDeclare(jl_intrinsics::popGCFrame), {gcframe}); - popGcframe->insertBefore(I->getTerminator()); + popGcframe->insertBefore(BB.getTerminator()); } } } diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index f0c0c6ee77b44..3b55339984516 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -23,7 +23,7 @@ JuliaPassContext::JuliaPassContext() tbaa_gcframe(nullptr), tbaa_tag(nullptr), - pgcstack_getter(nullptr), gc_flush_func(nullptr), + pgcstack_getter(nullptr), adoptthread_func(nullptr), gc_flush_func(nullptr), gc_preserve_begin_func(nullptr), gc_preserve_end_func(nullptr), pointer_from_objref_func(nullptr), alloc_obj_func(nullptr), typeof_func(nullptr), write_barrier_func(nullptr), @@ -44,6 +44,7 @@ void JuliaPassContext::initFunctions(Module &M) tbaa_tag = tbaa_make_child_with_context(llvmctx, "jtbaa_tag", tbaa_data_scalar).first; pgcstack_getter = M.getFunction("julia.get_pgcstack"); + adoptthread_func = M.getFunction("julia.get_pgcstack_or_new"); gc_flush_func = M.getFunction("julia.gcroot_flush"); gc_preserve_begin_func = M.getFunction("llvm.julia.gc_preserve_begin"); gc_preserve_end_func = M.getFunction("llvm.julia.gc_preserve_end"); @@ -70,10 +71,13 @@ void JuliaPassContext::initAll(Module &M) llvm::CallInst *JuliaPassContext::getPGCstack(llvm::Function &F) const { - for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); pgcstack_getter && I != E; ++I) { - if (CallInst *callInst = dyn_cast<CallInst>(&*I)) { - if (callInst->getCalledOperand() == pgcstack_getter) { + if (!pgcstack_getter && !adoptthread_func) + return nullptr; + for (auto &I : F.getEntryBlock()) { + if (CallInst *callInst = dyn_cast<CallInst>(&I)) { + Value *callee = callInst->getCalledOperand(); + if
((pgcstack_getter && callee == pgcstack_getter) || + (adoptthread_func && callee == adoptthread_func)) { return callInst; } } diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 64d5dc00e2c5b..68f6efe42be6d 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -50,6 +50,7 @@ struct JuliaPassContext { // Intrinsics. llvm::Function *pgcstack_getter; + llvm::Function *adoptthread_func; llvm::Function *gc_flush_func; llvm::Function *gc_preserve_begin_func; llvm::Function *gc_preserve_end_func; diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp index e948e1c1a10bc..ad516d4ceb010 100644 --- a/src/llvm-ptls.cpp +++ b/src/llvm-ptls.cpp @@ -35,19 +35,19 @@ typedef Instruction TerminatorInst; namespace { struct LowerPTLS { - LowerPTLS(bool imaging_mode=false) - : imaging_mode(imaging_mode) + LowerPTLS(Module &M, bool imaging_mode=false) + : imaging_mode(imaging_mode), M(&M) {} - bool runOnModule(Module &M, bool *CFGModified); + bool run(bool *CFGModified); private: const bool imaging_mode; Module *M; - Function *pgcstack_getter; - MDNode *tbaa_const; - FunctionType *FT_pgcstack_getter; - PointerType *T_pgcstack_getter; - PointerType *T_pppjlvalue; + MDNode *tbaa_const{nullptr}; + MDNode *tbaa_gcframe{nullptr}; + FunctionType *FT_pgcstack_getter{nullptr}; + PointerType *T_pgcstack_getter{nullptr}; + PointerType *T_pppjlvalue{nullptr}; GlobalVariable *pgcstack_func_slot{nullptr}; GlobalVariable *pgcstack_key_slot{nullptr}; GlobalVariable *pgcstack_offset{nullptr}; @@ -55,7 +55,7 @@ struct LowerPTLS { Instruction *emit_pgcstack_tp(Value *offset, Instruction *insertBefore) const; template<typename T> T *add_comdat(T *G) const; GlobalVariable *create_aliased_global(Type *T, StringRef name) const; - void fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified); + void fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, bool or_new, bool *CFGModified); }; void LowerPTLS::set_pgcstack_attrs(CallInst *pgcstack) const @@ -159,19 +159,77 @@ inline T *LowerPTLS::add_comdat(T *G) const return G; } -void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified) +void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, bool or_new, bool *CFGModified) { if (pgcstack->use_empty()) { pgcstack->eraseFromParent(); return; } + if (or_new) { + // pgcstack(); + // if (pgcstack != nullptr) + // last_gc_state = emit_gc_unsafe_enter(ctx); + // phi = pgcstack; // fast + // else + // last_gc_state = gc_safe; + // phi = adopt(); // slow + // use phi; + // if (!retboxed) + // foreach(retinst) + // emit_gc_unsafe_leave(ctx, last_gc_state); + auto phi = PHINode::Create(pgcstack->getType(), 2, ""); + phi->insertAfter(pgcstack); + pgcstack->replaceAllUsesWith(phi); + MDBuilder MDB(pgcstack->getContext()); + SmallVector<uint32_t, 2> Weights{9, 1}; + TerminatorInst *fastTerm; + TerminatorInst *slowTerm; + auto cmp = new ICmpInst(phi, CmpInst::ICMP_NE, pgcstack, Constant::getNullValue(pgcstack->getType())); + SplitBlockAndInsertIfThenElse(cmp, phi, &fastTerm, &slowTerm, MDB.createBranchWeights(Weights)); + if (CFGModified) + *CFGModified = true; + // emit slow branch code + CallInst *adopt = cast<CallInst>(pgcstack->clone()); + Function *adoptFunc = M->getFunction(XSTR(jl_adopt_thread)); + if (adoptFunc == NULL) { + adoptFunc = Function::Create(pgcstack_getter->getFunctionType(), + pgcstack_getter->getLinkage(), pgcstack_getter->getAddressSpace(), + XSTR(jl_adopt_thread), M); + adoptFunc->copyAttributesFrom(pgcstack_getter); + adoptFunc->copyMetadata(pgcstack_getter, 0); + } + 
adopt->setCalledFunction(adoptFunc); + adopt->insertBefore(slowTerm); + phi->addIncoming(adopt, slowTerm->getParent()); + // emit fast branch code + IRBuilder<> builder(fastTerm->getParent()); + fastTerm->removeFromParent(); + MDNode *tbaa = tbaa_gcframe; + Value *prior = emit_gc_unsafe_enter(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, pgcstack), tbaa)); + builder.Insert(fastTerm); + phi->addIncoming(pgcstack, fastTerm->getParent()); + // emit pre-return cleanup + if (CountTrackedPointers(pgcstack->getParent()->getParent()->getReturnType()).count == 0) { + auto last_gc_state = PHINode::Create(Type::getInt8Ty(pgcstack->getContext()), 2, "", phi); + // if we called jl_adopt_thread, we must end this cfunction back in the safe-state + last_gc_state->addIncoming(ConstantInt::get(Type::getInt8Ty(M->getContext()), JL_GC_STATE_SAFE), slowTerm->getParent()); + last_gc_state->addIncoming(prior, fastTerm->getParent()); + for (auto &BB : *pgcstack->getParent()->getParent()) { + if (isa<ReturnInst>(BB.getTerminator())) { + IRBuilder<> builder(BB.getTerminator()); + emit_gc_unsafe_leave(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, phi), tbaa), last_gc_state); + } + } + } + } if (imaging_mode) { if (jl_tls_elf_support) { // if (offset != 0) - // pgcstack = tp + offset; + // pgcstack = tp + offset; // fast // else - // pgcstack = getter(); + // pgcstack = getter(); // slow auto offset = new LoadInst(getSizeTy(pgcstack->getContext()), pgcstack_offset, "", false, pgcstack); offset->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const); offset->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(pgcstack->getContext(), None)); @@ -184,7 +242,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified) SplitBlockAndInsertIfThenElse(cmp, pgcstack, &fastTerm, &slowTerm, MDB.createBranchWeights(Weights)); if (CFGModified) - *CFGModified = true; + *CFGModified = true; auto fastTLS = emit_pgcstack_tp(offset, fastTerm); auto phi = PHINode::Create(T_pppjlvalue, 2, "", pgcstack); @@ -248,37 +306,44 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified) } } -bool LowerPTLS::runOnModule(Module &_M, bool *CFGModified) +bool LowerPTLS::run(bool *CFGModified) { - M = &_M; - pgcstack_getter = M->getFunction("julia.get_pgcstack"); - if (!pgcstack_getter) - return false; + bool need_init = true; + auto runOnGetter = [&](bool or_new) { + Function *pgcstack_getter = M->getFunction(or_new ?
"julia.get_pgcstack_or_new" : "julia.get_pgcstack"); + if (!pgcstack_getter) + return false; - tbaa_const = tbaa_make_child_with_context(_M.getContext(), "jtbaa_const", nullptr, true).first; + if (need_init) { + tbaa_const = tbaa_make_child_with_context(M->getContext(), "jtbaa_const", nullptr, true).first; + tbaa_gcframe = tbaa_make_child_with_context(M->getContext(), "jtbaa_gcframe").first; - FT_pgcstack_getter = pgcstack_getter->getFunctionType(); + FT_pgcstack_getter = pgcstack_getter->getFunctionType(); #if defined(_OS_DARWIN_) - assert(sizeof(jl_pgcstack_key_t) == sizeof(uintptr_t)); - FT_pgcstack_getter = FunctionType::get(FT_pgcstack_getter->getReturnType(), {getSizeTy(_M.getContext())}, false); + assert(sizeof(jl_pgcstack_key_t) == sizeof(uintptr_t)); + FT_pgcstack_getter = FunctionType::get(FT_pgcstack_getter->getReturnType(), {getSizeTy(M->getContext())}, false); #endif - T_pgcstack_getter = FT_pgcstack_getter->getPointerTo(); - T_pppjlvalue = cast(FT_pgcstack_getter->getReturnType()); - if (imaging_mode) { - pgcstack_func_slot = create_aliased_global(T_pgcstack_getter, "jl_pgcstack_func_slot"); - pgcstack_key_slot = create_aliased_global(getSizeTy(_M.getContext()), "jl_pgcstack_key_slot"); // >= sizeof(jl_pgcstack_key_t) - pgcstack_offset = create_aliased_global(getSizeTy(_M.getContext()), "jl_tls_offset"); - } + T_pgcstack_getter = FT_pgcstack_getter->getPointerTo(); + T_pppjlvalue = cast(FT_pgcstack_getter->getReturnType()); + if (imaging_mode) { + pgcstack_func_slot = create_aliased_global(T_pgcstack_getter, "jl_pgcstack_func_slot"); + pgcstack_key_slot = create_aliased_global(getSizeTy(M->getContext()), "jl_pgcstack_key_slot"); // >= sizeof(jl_pgcstack_key_t) + pgcstack_offset = create_aliased_global(getSizeTy(M->getContext()), "jl_tls_offset"); + } + need_init = false; + } - for (auto it = pgcstack_getter->user_begin(); it != pgcstack_getter->user_end();) { - auto call = cast(*it); - ++it; - assert(call->getCalledOperand() == pgcstack_getter); - fix_pgcstack_use(call, CFGModified); - } - assert(pgcstack_getter->use_empty()); - pgcstack_getter->eraseFromParent(); - return true; + for (auto it = pgcstack_getter->user_begin(); it != pgcstack_getter->user_end();) { + auto call = cast(*it); + ++it; + assert(call->getCalledOperand() == pgcstack_getter); + fix_pgcstack_use(call, pgcstack_getter, or_new, CFGModified); + } + assert(pgcstack_getter->use_empty()); + pgcstack_getter->eraseFromParent(); + return true; + }; + return runOnGetter(false) + runOnGetter(true); } struct LowerPTLSLegacy: public ModulePass { @@ -290,8 +355,8 @@ struct LowerPTLSLegacy: public ModulePass { bool imaging_mode; bool runOnModule(Module &M) override { - LowerPTLS lower(imaging_mode); - return lower.runOnModule(M, nullptr); + LowerPTLS lower(M, imaging_mode); + return lower.run(nullptr); } }; @@ -304,9 +369,9 @@ static RegisterPass X("LowerPTLS", "LowerPTLS Pass", } // anonymous namespace PreservedAnalyses LowerPTLSPass::run(Module &M, ModuleAnalysisManager &AM) { - LowerPTLS lower(imaging_mode); + LowerPTLS lower(M, imaging_mode); bool CFGModified = false; - if (lower.runOnModule(M, &CFGModified)) { + if (lower.run(&CFGModified)) { if (CFGModified) { return PreservedAnalyses::none(); } else { diff --git a/src/partr.c b/src/partr.c index eeb0d0f456d97..ec6bbe3e5720a 100644 --- a/src/partr.c +++ b/src/partr.c @@ -26,6 +26,9 @@ static const int16_t not_sleeping = 0; // it is acceptable for the thread to be sleeping. static const int16_t sleeping = 1; +// this thread is dead. 
+static const int16_t sleeping_like_the_dead JL_UNUSED = 2; + // invariant: No thread is ever asleep unless sleep_check_state is sleeping (or we have a wakeup signal pending). // invariant: Any particular thread is not asleep unless that thread's sleep_check_state is sleeping. // invariant: The transition of a thread state to sleeping must be followed by a check that there wasn't work pending for it. @@ -182,7 +185,7 @@ static int sleep_check_after_threshold(uint64_t *start_cycles) static int wake_thread(int16_t tid) { - jl_ptls_t other = jl_all_tls_states[tid]; + jl_ptls_t other = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; int8_t state = sleeping; if (jl_atomic_load_relaxed(&other->sleep_check_state) == sleeping) { @@ -229,7 +232,7 @@ JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) if (wake_thread(tid)) { // check if we need to notify uv_run too jl_fence(); - jl_ptls_t other = jl_all_tls_states[tid]; + jl_ptls_t other = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_task_t *tid_task = jl_atomic_load_relaxed(&other->current_task); // now that we have changed the thread to not-sleeping, ensure that // either it has not yet acquired the libuv lock, or that it will @@ -244,7 +247,8 @@ JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) // in the future, we might want to instead wake some fraction of threads, // and let each of those wake additional threads if they find work int anysleep = 0; - for (tid = 0; tid < jl_n_threads; tid++) { + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + for (tid = 0; tid < nthreads; tid++) { if (tid != self) anysleep |= wake_thread(tid); } diff --git a/src/safepoint.c b/src/safepoint.c index b2feccf74e068..1ff26d616a5d8 100644 --- a/src/safepoint.c +++ b/src/safepoint.c @@ -111,10 +111,6 @@ void jl_safepoint_init(void) int jl_safepoint_start_gc(void) { - if (jl_n_threads == 1) { - jl_atomic_store_relaxed(&jl_gc_running, 1); - return 1; - } // The thread should have set this already assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) == JL_GC_STATE_WAITING); uv_mutex_lock(&safepoint_lock); @@ -137,10 +133,6 @@ int jl_safepoint_start_gc(void) void jl_safepoint_end_gc(void) { assert(jl_atomic_load_relaxed(&jl_gc_running)); - if (jl_n_threads == 1) { - jl_atomic_store_relaxed(&jl_gc_running, 0); - return; - } uv_mutex_lock(&safepoint_lock); // Need to reset the page protection before resetting the flag since // the thread will trigger a segfault immediately after returning from diff --git a/src/signal-handling.c b/src/signal-handling.c index 43782bf4070f2..698b1e21febb1 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -182,14 +182,10 @@ static int *profile_get_randperm(int size) JL_DLLEXPORT int jl_profile_is_buffer_full(void) { - // declare buffer full if there isn't enough room to take samples across all threads - #if defined(_OS_WINDOWS_) - uint64_t nthreads = 1; // windows only profiles the main thread - #else - uint64_t nthreads = jl_n_threads; - #endif - // the `+ 6` is for the two block terminators `0` plus 4 metadata entries - return bt_size_cur + (((JL_BT_MAX_ENTRY_SIZE + 1) + 6) * nthreads) > bt_size_max; + // Declare buffer full if there isn't enough room to sample even just the + // thread metadata and one max-sized frame. The `+ 6` is for the two block + // terminator `0`'s plus the 4 metadata entries. 
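Spelled out, the bound being guarded is one sample's worst case: a backtrace of at most `JL_BT_MAX_ENTRY_SIZE + 1` words, two zero-word block terminators, and the four per-sample metadata entries. Since the profiler now shares one buffer across all threads instead of pre-partitioning it per thread, the headroom no longer scales with the thread count. An equivalent formulation, assuming this file's `bt_size_cur`/`bt_size_max` globals (the helper name is hypothetical):

    static int profile_buffer_would_overflow(void)
    {
        size_t one_sample = (JL_BT_MAX_ENTRY_SIZE + 1) // one maximum-size frame
                          + 2                          // two block-terminating zeros
                          + 4;                         // per-sample metadata entries
        return bt_size_cur + one_sample > bt_size_max;
    }

which is exactly what the `return` below computes: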
+ return bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > bt_size_max; } static uint64_t jl_last_sigint_trigger = 0; diff --git a/src/signals-mach.c b/src/signals-mach.c index 5a1816a80f2b2..edc2b42215f67 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -50,7 +50,7 @@ void jl_mach_gc_end(void) uintptr_t item = (uintptr_t)suspended_threads.items[i]; int16_t tid = (int16_t)item; int8_t gc_state = (int8_t)(item >> 8); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_atomic_store_release(&ptls2->gc_state, gc_state); thread_resume(pthread_mach_thread_np(ptls2->system_id)); } @@ -119,7 +119,8 @@ static void allocate_mach_handler() if (_keymgr_set_lockmode_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, NM_ALLOW_RECURSION)) jl_error("_keymgr_set_lockmode_processwide_ptr failed"); - arraylist_new(&suspended_threads, jl_n_threads); + int16_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + arraylist_new(&suspended_threads, nthreads); // we will resize later (inside safepoint_lock), if needed pthread_t thread; pthread_attr_t attr; kern_return_t ret; @@ -221,7 +222,7 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exceptio host_thread_state_t state; kern_return_t ret = thread_get_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, &count); HANDLE_MACH_ERROR("thread_get_state", ret); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; if (!jl_get_safe_restore()) { assert(exception); ptls2->bt_size = @@ -265,8 +266,9 @@ kern_return_t catch_mach_exception_raise( #endif int16_t tid; jl_ptls_t ptls2 = NULL; - for (tid = 0; tid < jl_n_threads; tid++) { - jl_ptls_t _ptls2 = jl_all_tls_states[tid]; + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + for (tid = 0; tid < nthreads; tid++) { + jl_ptls_t _ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; if (pthread_mach_thread_np(_ptls2->system_id) == thread) { ptls2 = _ptls2; break; @@ -381,9 +383,15 @@ static void attach_exception_port(thread_port_t thread, int segv_only) HANDLE_MACH_ERROR("thread_set_exception_ports", ret); } -static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) +static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) { - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + if (ptls2 == NULL) // this thread is not alive + return 0; + jl_task_t *ct2 = ptls2 ? 
jl_atomic_load_relaxed(&ptls2->current_task) : NULL; + if (ct2 == NULL) // this thread is already dead + return 0; + mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); kern_return_t ret = thread_suspend(thread); @@ -395,18 +403,22 @@ static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) // Get the state of the suspended thread ret = thread_get_state(thread, MACH_THREAD_STATE, (thread_state_t)ctx, &count); + return 1; } static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) { static host_thread_state_t state; - jl_thread_suspend_and_get_state2(tid, &state); + if (!jl_thread_suspend_and_get_state2(tid, &state)) { + *ctx = NULL; + return; + } *ctx = (unw_context_t*)&state; } static void jl_thread_resume(int tid, int sig) { - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); kern_return_t ret = thread_resume(thread); HANDLE_MACH_ERROR("thread_resume", ret); @@ -416,7 +428,7 @@ static void jl_thread_resume(int tid, int sig) // or if SIGINT happens too often. static void jl_try_deliver_sigint(void) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); kern_return_t ret = thread_suspend(thread); @@ -452,11 +464,12 @@ CFI_NORETURN static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); host_thread_state_t state; - jl_thread_suspend_and_get_state2(0, &state); + if (!jl_thread_suspend_and_get_state2(0, &state)) + return; unw_context_t *uc = (unw_context_t*)&state; // This aborts `sleep` and other syscalls. @@ -608,8 +621,9 @@ void *mach_profile_listener(void *arg) // (so that thread zero gets notified last) int keymgr_locked = jl_lock_profile_mach(0); - int *randperm = profile_get_randperm(jl_n_threads); - for (int idx = jl_n_threads; idx-- > 0; ) { + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + int *randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { // Stop the threads in the random or reverse round-robin order. 
int i = randperm[idx]; // if there is no space left, break early @@ -621,7 +635,8 @@ void *mach_profile_listener(void *arg) if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) _dyld_atfork_prepare(); // briefly acquire the dlsym lock host_thread_state_t state; - jl_thread_suspend_and_get_state2(i, &state); + if (!jl_thread_suspend_and_get_state2(i, &state)) + continue; unw_context_t *uc = (unw_context_t*)&state; if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) _dyld_atfork_parent(); // quickly release the dlsym lock @@ -660,12 +675,12 @@ void *mach_profile_listener(void *arg) #else bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); #endif - jl_ptls_t ptls = jl_all_tls_states[i]; + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; // store threadid but add 1 as 0 is preserved to indicate end of block bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - // store task id + // store task id (never null) bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); // store cpu cycle clock diff --git a/src/signals-unix.c b/src/signals-unix.c index dadbd15de0832..1716bb21883af 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -372,7 +372,14 @@ static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) clock_gettime(CLOCK_REALTIME, &ts); ts.tv_sec += 1; pthread_mutex_lock(&in_signal_lock); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL; + if (ct2 == NULL) { + // this thread is not alive or already dead + *ctx = NULL; + pthread_mutex_unlock(&in_signal_lock); + return; + } jl_atomic_store_release(&ptls2->signal_request, 1); pthread_kill(ptls2->system_id, SIGUSR2); // wait for thread to acknowledge @@ -404,7 +411,7 @@ static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) static void jl_thread_resume(int tid, int sig) { - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_atomic_store_release(&ptls2->signal_request, sig == -1 ? 3 : 1); pthread_cond_broadcast(&exit_signal_cond); pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge @@ -420,7 +427,7 @@ static void jl_thread_resume(int tid, int sig) // or if SIGINT happens too often. 
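As on macOS, the Unix path above treats a NULL `current_task` as the liveness flag for a peer thread: `jl_delete_thread` clears it while holding `in_signal_lock`, so `jl_thread_suspend_and_get_state` can hand back a NULL context instead of signaling a thread that no longer exists. Every caller must now check for that; a hypothetical caller pattern (locking and error handling elided):

    static void sample_one_thread(int tid)
    {
        unw_context_t *signal_context;
        jl_thread_suspend_and_get_state(tid, &signal_context);
        if (signal_context == NULL)
            return; // thread exited after adoption; do not unwind or resume it
        // ... record a backtrace from signal_context ...
        jl_thread_resume(tid, 0);
    }

The SIGINT delivery below goes through the same atomic table load for thread 0: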
static void jl_try_deliver_sigint(void) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; jl_safepoint_enable_sigint(); jl_wake_libuv(); jl_atomic_store_release(&ptls2->signal_request, 2); @@ -451,7 +458,7 @@ CFI_NORETURN static void jl_exit_thread0(int state, jl_bt_element_t *bt_data, size_t bt_size) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; if (thread0_exit_count <= 1) { unw_context_t *signal_context; jl_thread_suspend_and_get_state(0, &signal_context); @@ -701,7 +708,7 @@ void trigger_profile_peek(void) if (bt_size_max == 0){ // If the buffer hasn't been initialized, initialize with default size // Keep these values synchronized with Profile.default_init() - if (jl_profile_init(10000000 * jl_n_threads, 1000000) == -1){ + if (jl_profile_init(10000000, 1000000) == -1) { jl_safe_printf("ERROR: could not initialize the profile buffer"); return; } @@ -831,6 +838,7 @@ static void *signal_listener(void *arg) } #endif + int nthreads = jl_atomic_load_acquire(&jl_n_threads); bt_size = 0; #if !defined(JL_DISABLE_LIBUNWIND) unw_context_t *signal_context; @@ -840,8 +848,8 @@ static void *signal_listener(void *arg) jl_lock_profile(); int *randperm; if (profile) - randperm = profile_get_randperm(jl_n_threads); - for (int idx = jl_n_threads; idx-- > 0; ) { + randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { // Stop the threads in the random or reverse round-robin order. int i = profile ? randperm[idx] : idx; // notify thread to stop @@ -853,7 +861,7 @@ static void *signal_listener(void *arg) // this part must be signal-handler safe if (critical) { bt_size += rec_backtrace_ctx(bt_data + bt_size, - JL_MAX_BT_SIZE / jl_n_threads - 1, + JL_MAX_BT_SIZE / nthreads - 1, signal_context, NULL); bt_data[bt_size++].uintptr = 0; } @@ -880,12 +888,12 @@ static void *signal_listener(void *arg) } jl_set_safe_restore(old_buf); - jl_ptls_t ptls2 = jl_all_tls_states[i]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; // store threadid but add 1 as 0 is preserved to indicate end of block bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1; - // store task id + // store task id (never null) bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task); // store cpu cycle clock @@ -927,11 +935,11 @@ static void *signal_listener(void *arg) else { #ifndef SIGINFO // SIGINFO already prints this automatically int nrunning = 0; - for (int idx = jl_n_threads; idx-- > 0; ) { - jl_ptls_t ptls2 = jl_all_tls_states[idx]; + for (int idx = nthreads; idx-- > 0; ) { + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[idx]; nrunning += !jl_atomic_load_relaxed(&ptls2->sleep_check_state); } - jl_safe_printf("\ncmd: %s %d running %d of %d\n", jl_options.julia_bin ? jl_options.julia_bin : "julia", uv_os_getpid(), nrunning, jl_n_threads); + jl_safe_printf("\ncmd: %s %d running %d of %d\n", jl_options.julia_bin ? jl_options.julia_bin : "julia", uv_os_getpid(), nrunning, nthreads); #endif jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig)); diff --git a/src/signals-win.c b/src/signals-win.c index 178a7463b8d50..83e92ff400e1d 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -165,7 +165,7 @@ HANDLE hMainThread = INVALID_HANDLE_VALUE; // Try to throw the exception in the master thread. 
static void jl_try_deliver_sigint(void) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; jl_lock_profile(); jl_safepoint_enable_sigint(); jl_wake_libuv(); @@ -362,12 +362,12 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, &ctxThread, NULL); - jl_ptls_t ptls = jl_all_tls_states[0]; // given only profiling hMainThread + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread // store threadid but add 1 as 0 is preserved to indicate end of block bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - // store task id + // store task id (never null) bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); // store cpu cycle clock diff --git a/src/task.c b/src/task.c index 22a5ad214e0b8..427c16b2481ab 100644 --- a/src/task.c +++ b/src/task.c @@ -265,7 +265,8 @@ JL_DLLEXPORT void *jl_task_stack_buffer(jl_task_t *task, size_t *size, int *ptid { size_t off = 0; #ifndef _OS_WINDOWS_ - if (jl_all_tls_states[0]->root_task == task) { + jl_ptls_t ptls0 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; + if (ptls0->root_task == task) { // See jl_init_root_task(). The root task of the main thread // has its buffer enlarged by an artificial 3000000 bytes, but // that means that the start of the buffer usually points to @@ -306,7 +307,8 @@ JL_DLLEXPORT void jl_active_task_stack(jl_task_t *task, else if (task->stkbuf) { *total_start = *active_start = (char*)task->stkbuf; #ifndef _OS_WINDOWS_ - if (jl_all_tls_states[0]->root_task == task) { + jl_ptls_t ptls0 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; + if (ptls0->root_task == task) { // See jl_init_root_task(). 
The root task of the main thread // has its buffer enlarged by an artificial 3000000 bytes, but // that means that the start of the buffer usually points to diff --git a/src/threading.c b/src/threading.c index 2cebdb22fc0aa..581032092168c 100644 --- a/src/threading.c +++ b/src/threading.c @@ -46,12 +46,16 @@ JL_DLLEXPORT void *jl_get_ptls_states(void) return jl_current_task->ptls; } +static void jl_delete_thread(void*); + #if !defined(_OS_WINDOWS_) +static pthread_key_t jl_task_exit_key; static pthread_key_t jl_safe_restore_key; __attribute__((constructor)) void _jl_init_safe_restore(void) { pthread_key_create(&jl_safe_restore_key, NULL); + pthread_key_create(&jl_task_exit_key, jl_delete_thread); } JL_DLLEXPORT jl_jmp_buf *jl_get_safe_restore(void) @@ -124,21 +128,26 @@ static DWORD jl_safe_restore_key; BOOLEAN WINAPI DllMain(IN HINSTANCE hDllHandle, IN DWORD nReason, IN LPVOID Reserved) { + jl_task_t *ct; switch (nReason) { case DLL_PROCESS_ATTACH: jl_pgcstack_key = TlsAlloc(); assert(jl_pgcstack_key != TLS_OUT_OF_INDEXES); jl_safe_restore_key = TlsAlloc(); assert(jl_safe_restore_key != TLS_OUT_OF_INDEXES); - // Fall through - case DLL_THREAD_ATTACH: - break; - case DLL_THREAD_DETACH: break; case DLL_PROCESS_DETACH: TlsFree(jl_pgcstack_key); TlsFree(jl_safe_restore_key); break; + case DLL_THREAD_ATTACH: + // will call jl_adopt_thread lazily on-demand + break; + case DLL_THREAD_DETACH: + ct = jl_get_current_task(); + if (ct != NULL) + jl_delete_thread((void*)ct->ptls); + break; } return 1; // success } @@ -291,7 +300,8 @@ void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t *k) #endif static uv_mutex_t tls_lock; // controls write-access to these variables: -jl_ptls_t *jl_all_tls_states JL_GLOBALLY_ROOTED; +_Atomic(jl_ptls_t*) jl_all_tls_states JL_GLOBALLY_ROOTED; +int jl_all_tls_states_size; static uv_cond_t cond; // return calling thread's ID @@ -302,7 +312,8 @@ JL_DLLEXPORT int16_t jl_threadid(void) JL_DLLEXPORT int8_t jl_threadpoolid(int16_t tid) JL_NOTSAFEPOINT { - if (tid < 0 || tid >= jl_n_threads) + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + if (tid < 0 || tid >= nthreads) jl_error("invalid tid"); int n = 0; for (int i = 0; i < jl_n_threadpools; i++) { @@ -310,14 +321,25 @@ JL_DLLEXPORT int8_t jl_threadpoolid(int16_t tid) JL_NOTSAFEPOINT if (tid < n) return (int8_t)i; } - jl_error("internal error: couldn't determine threadpool id"); + return 0; // everything else uses threadpool 0 (though does not become part of any threadpool) } jl_ptls_t jl_init_threadtls(int16_t tid) { +#ifndef _OS_WINDOWS_ + if (pthread_getspecific(jl_task_exit_key)) + abort(); +#endif + if (jl_get_pgcstack() != NULL) + abort(); jl_ptls_t ptls = (jl_ptls_t)calloc(1, sizeof(jl_tls_states_t)); +#ifndef _OS_WINDOWS_ + pthread_setspecific(jl_task_exit_key, (void*)ptls); +#endif ptls->system_id = (jl_thread_t)(uintptr_t)uv_thread_self(); ptls->rngseed = jl_rand(); + if (tid == 0) + ptls->disable_gc = 1; #ifdef _OS_WINDOWS_ if (tid == 0) { if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(), @@ -328,7 +350,6 @@ jl_ptls_t jl_init_threadtls(int16_t tid) } } #endif - ptls->tid = tid; jl_atomic_store_relaxed(&ptls->gc_state, 0); // GC unsafe // Conditionally initialize the safepoint address. 
See comment in // `safepoint.c` @@ -349,11 +370,80 @@ jl_ptls_t jl_init_threadtls(int16_t tid) uv_mutex_init(&ptls->sleep_lock); uv_cond_init(&ptls->wake_signal); - jl_all_tls_states[tid] = ptls; + uv_mutex_lock(&tls_lock); + jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); + if (tid == -1) + tid = jl_atomic_load_relaxed(&jl_n_threads); + ptls->tid = tid; + if (jl_all_tls_states_size <= tid) { + int i, newsize = jl_all_tls_states_size + tid + 2; + jl_ptls_t *newpptls = (jl_ptls_t*)calloc(newsize, sizeof(jl_ptls_t)); + for (i = 0; i < jl_all_tls_states_size; i++) { + newpptls[i] = allstates[i]; + } + jl_atomic_store_release(&jl_all_tls_states, newpptls); + jl_all_tls_states_size = newsize; + jl_gc_add_quiescent(ptls, (void**)allstates, free); + allstates = newpptls; + } + allstates[tid] = ptls; + if (jl_atomic_load_relaxed(&jl_n_threads) < tid + 1) + jl_atomic_store_release(&jl_n_threads, tid + 1); + jl_fence(); + uv_mutex_unlock(&tls_lock); return ptls; } +JL_DLLEXPORT jl_gcframe_t **jl_adopt_thread(void) +{ + // initialize this thread (assign tid, create heap, set up root task) + jl_ptls_t ptls = jl_init_threadtls(-1); + void *stack_lo, *stack_hi; + jl_init_stack_limits(0, &stack_lo, &stack_hi); + + (void)jl_gc_unsafe_enter(ptls); + // warning: this changes `jl_current_task`, so be careful not to call that from this function + jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi); + JL_GC_PROMISE_ROOTED(ct); + + return &ct->gcstack; +} + +static void jl_delete_thread(void *value) +{ + jl_ptls_t ptls = (jl_ptls_t)value; + // Acquire the profile write lock, to ensure we are not racing with the `kill` + // call in the profile code which will also try to look at these variables. + // We have no control over when the user calls pthread_join, so we must do + // this here by blocking. This also synchronizes our read of `current_task` + // (which is the flag we currently use to check the liveness state of a thread). +#ifdef _OS_WINDOWS_ + jl_lock_profile_wr(); +#elif defined(JL_DISABLE_LIBUNWIND) + // nothing +#elif defined(__APPLE__) + jl_lock_profile_wr(); +#else + pthread_mutex_lock(&in_signal_lock); +#endif +#ifndef _OS_WINDOWS_ + pthread_setspecific(jl_task_exit_key, NULL); +#endif + jl_atomic_store_relaxed(&ptls->current_task, NULL); // dead + jl_atomic_store_relaxed(&ptls->sleep_check_state, 2); // dead, interpreted as sleeping and unwakeable +#ifdef _OS_WINDOWS_ + jl_unlock_profile_wr(); +#elif defined(JL_DISABLE_LIBUNWIND) + // nothing +#elif defined(__APPLE__) + jl_unlock_profile_wr(); +#else + pthread_mutex_unlock(&in_signal_lock); +#endif + (void)jl_gc_safe_enter(ptls); +} + JL_DLLEXPORT jl_mutex_t jl_codegen_lock; jl_mutex_t typecache_lock; @@ -467,7 +557,6 @@ void jl_init_threading(void) uv_mutex_init(&tls_lock); uv_cond_init(&cond); - #ifdef JL_ELF_TLS_VARIANT jl_check_tls(); #endif @@ -477,8 +566,8 @@ void jl_init_threading(void) // environment variable. Set the globals `jl_n_threadpools`, `jl_n_threads` // and `jl_n_threads_per_pool`. 
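Stepping back to the table-growth logic in `jl_init_threadtls` above: the old `jl_all_tls_states` array is never freed eagerly. A racing reader may have loaded the old pointer just before the release store of the new one, so the array is handed to the newly exported `jl_gc_add_quiescent`, which defers the `free` until every thread has passed a GC quiescent point, an RCU-style retirement. A hedged sketch of that usage (helper name hypothetical; the caller is assumed to have already published the replacement table):

    // `oldtab` may still be in use by readers that loaded it before the
    // release store of the new table, so it is retired rather than freed:
    // free(oldtab) runs at the next quiescent point, not immediately.
    static void retire_old_tls_table(jl_ptls_t ptls, jl_ptls_t *oldtab)
    {
        jl_gc_add_quiescent(ptls, (void**)oldtab, free);
    }

The startup path that follows then fills in the globals named just above: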
jl_n_threadpools = 1; - jl_n_threads = JULIA_NUM_THREADS; - int16_t nthreads = jl_n_threads, nthreadsi = 0; + int16_t nthreads = JULIA_NUM_THREADS; + int16_t nthreadsi = 0; char *endptr, *endptri; if (jl_options.nthreads != 0) { // --threads specified @@ -516,26 +605,26 @@ void jl_init_threading(void) } } - jl_n_threads = nthreads + nthreadsi; - jl_n_threads_per_pool = (int *)malloc(2 * sizeof(int)); + jl_all_tls_states_size = nthreads + nthreadsi; + jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreads; jl_n_threads_per_pool[1] = nthreadsi; -#ifndef __clang_gcanalyzer__ - jl_all_tls_states = (jl_ptls_t*)calloc(jl_n_threads, sizeof(void*)); -#endif + jl_atomic_store_release(&jl_all_tls_states, (jl_ptls_t*)calloc(jl_all_tls_states_size, sizeof(jl_ptls_t))); + jl_atomic_store_release(&jl_n_threads, jl_all_tls_states_size); } static uv_barrier_t thread_init_done; void jl_start_threads(void) { + int nthreads = jl_atomic_load_relaxed(&jl_n_threads); int cpumasksize = uv_cpumask_size(); char *cp; int i, exclusive; uv_thread_t uvtid; - if (cpumasksize < jl_n_threads) // also handles error case - cpumasksize = jl_n_threads; + if (cpumasksize < nthreads) // also handles error case + cpumasksize = nthreads; char *mask = (char*)alloca(cpumasksize); // do we have exclusive use of the machine? default is no @@ -548,7 +637,7 @@ void jl_start_threads(void) // according to a 'compact' policy // non-exclusive: no affinity settings; let the kernel move threads about if (exclusive) { - if (jl_n_threads > jl_cpu_threads()) { + if (nthreads > jl_cpu_threads()) { jl_printf(JL_STDERR, "ERROR: Too many threads requested for %s option.\n", MACHINE_EXCLUSIVE_NAME); exit(1); } @@ -559,9 +648,6 @@ void jl_start_threads(void) mask[0] = 0; } - // The analyzer doesn't know jl_n_threads doesn't change, help it - size_t nthreads = jl_n_threads; - // create threads uv_barrier_init(&thread_init_done, nthreads); diff --git a/src/threading.h b/src/threading.h index 4c6f1e19881f5..9fd63f0fd188d 100644 --- a/src/threading.h +++ b/src/threading.h @@ -12,7 +12,7 @@ extern "C" { #define PROFILE_JL_THREADING 0 -extern jl_ptls_t *jl_all_tls_states JL_GLOBALLY_ROOTED; /* thread local storage */ +extern _Atomic(jl_ptls_t*) jl_all_tls_states JL_GLOBALLY_ROOTED; /* thread local storage */ typedef struct _jl_threadarg_t { int16_t tid; diff --git a/stdlib/Distributed/test/distributed_exec.jl b/stdlib/Distributed/test/distributed_exec.jl index 8ed55550e61b9..116f489677996 100644 --- a/stdlib/Distributed/test/distributed_exec.jl +++ b/stdlib/Distributed/test/distributed_exec.jl @@ -153,12 +153,12 @@ function _getenv_include_thread_unsafe() end const _env_include_thread_unsafe = _getenv_include_thread_unsafe() function include_thread_unsafe_tests() - if Threads.nthreads() > 1 + if Threads.maxthreadid() > 1 if _env_include_thread_unsafe return true end - msg = "Skipping a thread-unsafe test because `Threads.nthreads() > 1`" - @warn msg Threads.nthreads() + msg = "Skipping a thread-unsafe test because `Threads.maxthreadid() > 1`" + @warn msg Threads.maxthreadid() Test.@test_broken false return false end diff --git a/stdlib/InteractiveUtils/src/InteractiveUtils.jl b/stdlib/InteractiveUtils/src/InteractiveUtils.jl index 4621ed07ed124..8d0f23f5756ce 100644 --- a/stdlib/InteractiveUtils/src/InteractiveUtils.jl +++ b/stdlib/InteractiveUtils/src/InteractiveUtils.jl @@ -141,7 +141,7 @@ function versioninfo(io::IO=stdout; verbose::Bool=false) println(io, " WORD_SIZE: ", Sys.WORD_SIZE) println(io, " LIBM: 
",Base.libm_name) println(io, " LLVM: libLLVM-",Base.libllvm_version," (", Sys.JIT, ", ", Sys.CPU_NAME, ")") - println(io, " Threads: ", Threads.nthreads(), " on ", Sys.CPU_THREADS, " virtual cores") + println(io, " Threads: ", Threads.maxthreadid(), " on ", Sys.CPU_THREADS, " virtual cores") function is_nonverbose_env(k::String) return occursin(r"^JULIA_|^DYLD_|^LD_", k) diff --git a/stdlib/LinearAlgebra/src/LinearAlgebra.jl b/stdlib/LinearAlgebra/src/LinearAlgebra.jl index 030844a9e88e7..e4af012cc5f13 100644 --- a/stdlib/LinearAlgebra/src/LinearAlgebra.jl +++ b/stdlib/LinearAlgebra/src/LinearAlgebra.jl @@ -548,7 +548,8 @@ function versioninfo(io::IO=stdout) println(io, indent, "--> ", lib.libname, " (", interface, ")") end println(io, "Threading:") - println(io, indent, "Threads.nthreads() = ", Base.Threads.nthreads()) + println(io, indent, "Threads.threadpoolsize() = ", Threads.threadpoolsize()) + println(io, indent, "Threads.maxthreadid() = ", Base.Threads.maxthreadid()) println(io, indent, "LinearAlgebra.BLAS.get_num_threads() = ", BLAS.get_num_threads()) println(io, "Relevant environment variables:") env_var_names = [ diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index f260be3ae4dfe..f7cdb2ca39b84 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -102,8 +102,7 @@ function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} end delay_cur = ccall(:jl_profile_delay_nsec, UInt64, ())/10^9 if n === nothing && delay === nothing - nthreads = Sys.iswindows() ? 1 : Threads.nthreads() # windows only profiles the main thread - return round(Int, n_cur / nthreads), delay_cur + return n_cur, delay_cur end nnew = (n === nothing) ? n_cur : n delaynew = (delay === nothing) ? delay_cur : delay @@ -111,20 +110,17 @@ function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} end function init(n::Integer, delay::Real; limitwarn::Bool = true) - nthreads = Sys.iswindows() ? 
1 : Threads.nthreads() # windows only profiles the main thread sample_size_bytes = sizeof(Ptr) # == Sys.WORD_SIZE / 8 - buffer_samples = n * nthreads + buffer_samples = n buffer_size_bytes = buffer_samples * sample_size_bytes if buffer_size_bytes > 2^29 && Sys.WORD_SIZE == 32 - buffer_size_bytes_per_thread = floor(Int, 2^29 / nthreads) - buffer_samples_per_thread = floor(Int, buffer_size_bytes_per_thread / sample_size_bytes) - buffer_samples = buffer_samples_per_thread * nthreads + buffer_samples = floor(Int, 2^29 / sample_size_bytes) buffer_size_bytes = buffer_samples * sample_size_bytes - limitwarn && @warn "Requested profile buffer limited to 512MB (n = $buffer_samples_per_thread per thread) given that this system is 32-bit" + limitwarn && @warn "Requested profile buffer limited to 512MB (n = $buffer_samples) given that this system is 32-bit" end - status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), buffer_samples, round(UInt64,10^9*delay)) + status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), buffer_samples, round(UInt64, 10^9*delay)) if status == -1 - error("could not allocate space for ", n, " instruction pointers per thread being profiled ($nthreads threads, $(Base.format_bytes(buffer_size_bytes)) total)") + error("could not allocate space for ", n, " instruction pointers ($(Base.format_bytes(buffer_size_bytes)))") end end @@ -427,7 +423,7 @@ function getdict!(dict::LineInfoDict, data::Vector{UInt}) n_unique_ips = length(unique_ips) n_unique_ips == 0 && return dict iplookups = similar(unique_ips, Vector{StackFrame}) - @sync for indexes_part in Iterators.partition(eachindex(unique_ips), div(n_unique_ips, Threads.nthreads(), RoundUp)) + @sync for indexes_part in Iterators.partition(eachindex(unique_ips), div(n_unique_ips, Threads.threadpoolsize(), RoundUp)) Threads.@spawn begin for i in indexes_part iplookups[i] = _lookup_corrected(unique_ips[i]) diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl index 86c391d573e50..27b35dc9c035b 100644 --- a/stdlib/Profile/test/runtests.jl +++ b/stdlib/Profile/test/runtests.jl @@ -120,11 +120,10 @@ end @testset "setting sample count and delay in init" begin n_, delay_ = Profile.init() n_original = n_ - nthreads = Sys.iswindows() ? 1 : Threads.nthreads() sample_size_bytes = sizeof(Ptr) def_n = Sys.iswindows() && Sys.WORD_SIZE == 32 ? 
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index 86c391d573e50..27b35dc9c035b 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -120,11 +120,10 @@ end
 @testset "setting sample count and delay in init" begin
     n_, delay_ = Profile.init()
     n_original = n_
-    nthreads = Sys.iswindows() ? 1 : Threads.nthreads()
     sample_size_bytes = sizeof(Ptr)
     def_n = Sys.iswindows() && Sys.WORD_SIZE == 32 ? 1_000_000 : 10_000_000
-    if Sys.WORD_SIZE == 32 && (def_n * nthreads * sample_size_bytes) > 2^29
-        @test n_ * nthreads * sample_size_bytes <= 2^29
+    if Sys.WORD_SIZE == 32 && (def_n * sample_size_bytes) > 2^29
+        @test n_ * sample_size_bytes <= 2^29
     else
         @test n_ == def_n
     end
@@ -133,8 +132,8 @@ end
     @test delay_ == def_delay
     Profile.init(n=1_000_001, delay=0.0005)
     n_, delay_ = Profile.init()
-    if Sys.WORD_SIZE == 32 && (1_000_001 * nthreads * sample_size_bytes) > 2^29
-        @test n_ * nthreads * sample_size_bytes <= 2^29
+    if Sys.WORD_SIZE == 32 && (1_000_001 * sample_size_bytes) > 2^29
+        @test n_ * sample_size_bytes <= 2^29
     else
         @test n_ == 1_000_001
     end
diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl
index cc76c6fcfb0c8..7063f1f87bf68 100644
--- a/test/cmdlineargs.jl
+++ b/test/cmdlineargs.jl
@@ -226,7 +226,7 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no`
     @test errors_not_signals(`$exename --cpu-target=invalidtarget`)

     # -t, --threads
-    code = "print(Threads.nthreads())"
+    code = "print(Threads.threadpoolsize())"
     cpu_threads = ccall(:jl_effective_threads, Int32, ())
     @test string(cpu_threads) ==
           read(`$exename --threads auto -e $code`, String) ==
@@ -254,7 +254,7 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no`

     # Combining --threads and --procs: --threads does propagate
     withenv("JULIA_NUM_THREADS" => nothing) do
-        code = "print(sum(remotecall_fetch(Threads.nthreads, x) for x in procs()))"
+        code = "print(sum(remotecall_fetch(Threads.threadpoolsize, x) for x in procs()))"
         @test read(`$exename -p2 -t2 -e $code`, String) == "6"
     end
diff --git a/test/runtests.jl b/test/runtests.jl
index 4c9ac1cfd869c..3227804cf7b47 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -127,7 +127,7 @@ cd(@__DIR__) do
         println("""
             Running parallel tests with:
               nworkers() = $(nworkers())
-              nthreads() = $(Threads.nthreads())
+              nthreads() = $(Threads.threadpoolsize())
               Sys.CPU_THREADS = $(Sys.CPU_THREADS)
               Sys.total_memory() = $(Base.format_bytes(Sys.total_memory()))
               Sys.free_memory() = $(Base.format_bytes(Sys.free_memory()))
diff --git a/test/threads.jl b/test/threads.jl
index 09e802757062b..fb684b275e864 100644
--- a/test/threads.jl
+++ b/test/threads.jl
@@ -124,7 +124,7 @@ end
 function get_nthreads(options = ``; cpus = nothing)
     cmd = `$(Base.julia_cmd()) --startup-file=no $(options)`
-    cmd = `$cmd -e "print(Threads.nthreads())"`
+    cmd = `$cmd -e "print(Threads.threadpoolsize())"`
     cmd = addenv(cmd, "JULIA_EXCLUSIVE" => "0", "JULIA_NUM_THREADS" => "auto")
     if cpus !== nothing
         cmd = setcpuaffinity(cmd, cpus)
diff --git a/test/threads_exec.jl b/test/threads_exec.jl
index 4bce3ebd71b41..68ba9377cf955 100644
--- a/test/threads_exec.jl
+++ b/test/threads_exec.jl
@@ -2,7 +2,7 @@
 using Test
 using Base.Threads
-using Base.Threads: SpinLock
+using Base.Threads: SpinLock, threadpoolsize

 # for cfunction_closure
 include("testenv.jl")
@@ -27,9 +27,12 @@ end
 # (expected test duration is about 18-180 seconds)
 Timer(t -> killjob("KILLING BY THREAD TEST WATCHDOG\n"), 1200)

+@test Threads.threadid() == 1
+@test 1 <= threadpoolsize() <= Threads.maxthreadid()
+
 # basic lock check
-if nthreads() > 1
-    let lk = Base.Threads.SpinLock()
+if threadpoolsize() > 1
+    let lk = SpinLock()
         c1 = Base.Event()
         c2 = Base.Event()
         @test trylock(lk)
@@ -50,7 +53,7 @@ end

 # threading constructs

-let a = zeros(Int, 2 * nthreads())
+let a = zeros(Int, 2 * threadpoolsize())
     @threads for i = 1:length(a)
         @sync begin
             @async begin
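The two new asserts encode the ordering the rest of this test file relies on. Stated directly (a sketch, assuming this patch's API; `maxthreadid` and `threadpoolsize` are imported explicitly here, as the tests above do):

using Base.Threads
using Base.Threads: maxthreadid, threadpoolsize

@assert 1 <= threadid() <= maxthreadid()       # ids never exceed the (growing) thread count
@assert threadpoolsize() <= maxthreadid()      # the default pool is a subset of all threads
@assert nthreads(:default) == threadpoolsize() # same quantity, two spellings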
@@ -70,7 +73,7 @@ end

 # parallel loop with parallel atomic addition
 function threaded_loop(a, r, x)
-    counter = Threads.Atomic{Int}(min(Threads.nthreads(), length(r)))
+    counter = Threads.Atomic{Int}(min(threadpoolsize(), length(r)))
     @threads for i in r
         # synchronize the start given that each partition is started sequentially,
         # meaning that without the wait, if the loop is too fast the iteration can happen in order
@@ -208,7 +211,7 @@ function threaded_gc_locked(::Type{LockT}) where LockT
 end
 threaded_gc_locked(SpinLock)
-threaded_gc_locked(Threads.ReentrantLock)
+threaded_gc_locked(ReentrantLock)

 # Issue 33159
 # Make sure that a Threads.Condition can't be used without being locked, on any thread.
@@ -423,7 +426,7 @@ end
 for T in intersect((Int32, Int64, Float32, Float64), Base.Threads.atomictypes)
     var = Atomic{T}()
     nloops = 1000
-    di = nthreads()
+    di = threadpoolsize()
     @threads for i in 1:di
         test_atomic_cas!(var, i:di:nloops)
     end
@@ -513,7 +516,7 @@ function test_thread_cfunction()
     @test cfs[1] == cf1
     @test cfs[2] == cf(fs[2])
     @test length(unique(cfs)) == 1000
-    ok = zeros(Int, nthreads())
+    ok = zeros(Int, threadpoolsize())
     @threads :static for i in 1:10000
         i = mod1(i, 1000)
         fi = fs[i]
@@ -529,14 +532,14 @@ if cfunction_closure
 end

 function test_thread_range()
-    a = zeros(Int, nthreads())
+    a = zeros(Int, threadpoolsize())
     @threads for i in 1:threadid()
         a[i] = 1
     end
     for i in 1:threadid()
         @test a[i] == 1
     end
-    for i in (threadid() + 1):nthreads()
+    for i in (threadid() + 1):threadpoolsize()
         @test a[i] == 0
     end
 end
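Several tests above size per-thread scratch space with `threadpoolsize()` and index it by `threadid()` under `:static` scheduling. A minimal sketch of that idiom; `tsum` is a hypothetical helper, and it assumes no interactive pool, so worker ids are exactly `1:threadpoolsize()`:

using Base.Threads
using Base.Threads: threadpoolsize

function tsum(xs)
    acc = zeros(eltype(xs), threadpoolsize())  # one slot per default-pool thread
    @threads :static for i in eachindex(xs)
        acc[threadid()] += xs[i]               # :static pins iterations to threads
    end
    return sum(acc)
end

@assert tsum(collect(1:1000)) == 500500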
@@ -576,17 +579,17 @@ test_nested_loops()

 function test_thread_too_few_iters()
     x = Atomic()
-    a = zeros(Int, nthreads()+2)
-    threaded_loop(a, 1:nthreads()-1, x)
-    found = zeros(Bool, nthreads()+2)
-    for i=1:nthreads()-1
+    a = zeros(Int, threadpoolsize()+2)
+    threaded_loop(a, 1:threadpoolsize()-1, x)
+    found = zeros(Bool, threadpoolsize()+2)
+    for i=1:threadpoolsize()-1
         found[a[i]] = true
     end
-    @test x[] == nthreads()-1
+    @test x[] == threadpoolsize()-1
     # Next test checks that all loop iterations ran,
     # and were unique (via pigeon-hole principle).
-    @test !(false in found[1:nthreads()-1])
-    @test !(true in found[nthreads():end])
+    @test !(false in found[1:threadpoolsize()-1])
+    @test !(true in found[threadpoolsize():end])
 end
 test_thread_too_few_iters()
@@ -728,10 +731,10 @@ function _atthreads_with_error(a, err)
     end
     a
 end
-@test_throws CompositeException _atthreads_with_error(zeros(nthreads()), true)
-let a = zeros(nthreads())
+@test_throws CompositeException _atthreads_with_error(zeros(threadpoolsize()), true)
+let a = zeros(threadpoolsize())
     _atthreads_with_error(a, false)
-    @test a == [1:nthreads();]
+    @test a == [1:threadpoolsize();]
 end

 # static schedule
@@ -742,11 +745,11 @@ function _atthreads_static_schedule(n)
     end
     return ids
 end
-@test _atthreads_static_schedule(nthreads()) == 1:nthreads()
+@test _atthreads_static_schedule(threadpoolsize()) == 1:threadpoolsize()
 @test _atthreads_static_schedule(1) == [1;]
 @test_throws(
     "`@threads :static` cannot be used concurrently or nested",
-    @threads(for i = 1:1; _atthreads_static_schedule(nthreads()); end),
+    @threads(for i = 1:1; _atthreads_static_schedule(threadpoolsize()); end),
 )

 # dynamic schedule
@@ -759,35 +762,35 @@ function _atthreads_dynamic_schedule(n)
     end
     return inc[], flags
 end
-@test _atthreads_dynamic_schedule(nthreads()) == (nthreads(), ones(nthreads()))
+@test _atthreads_dynamic_schedule(threadpoolsize()) == (threadpoolsize(), ones(threadpoolsize()))
 @test _atthreads_dynamic_schedule(1) == (1, ones(1))
 @test _atthreads_dynamic_schedule(10) == (10, ones(10))
-@test _atthreads_dynamic_schedule(nthreads() * 2) == (nthreads() * 2, ones(nthreads() * 2))
+@test _atthreads_dynamic_schedule(threadpoolsize() * 2) == (threadpoolsize() * 2, ones(threadpoolsize() * 2))

 # nested dynamic schedule
 function _atthreads_dynamic_dynamic_schedule()
     inc = Threads.Atomic{Int}(0)
-    Threads.@threads :dynamic for _ = 1:nthreads()
-        Threads.@threads :dynamic for _ = 1:nthreads()
+    Threads.@threads :dynamic for _ = 1:threadpoolsize()
+        Threads.@threads :dynamic for _ = 1:threadpoolsize()
             Threads.atomic_add!(inc, 1)
         end
     end
     return inc[]
 end
-@test _atthreads_dynamic_dynamic_schedule() == nthreads() * nthreads()
+@test _atthreads_dynamic_dynamic_schedule() == threadpoolsize() * threadpoolsize()

 function _atthreads_static_dynamic_schedule()
-    ids = zeros(Int, nthreads())
+    ids = zeros(Int, threadpoolsize())
     inc = Threads.Atomic{Int}(0)
-    Threads.@threads :static for i = 1:nthreads()
+    Threads.@threads :static for i = 1:threadpoolsize()
         ids[i] = Threads.threadid()
-        Threads.@threads :dynamic for _ = 1:nthreads()
+        Threads.@threads :dynamic for _ = 1:threadpoolsize()
             Threads.atomic_add!(inc, 1)
         end
     end
     return ids, inc[]
 end
-@test _atthreads_static_dynamic_schedule() == (1:nthreads(), nthreads() * nthreads())
+@test _atthreads_static_dynamic_schedule() == (1:threadpoolsize(), threadpoolsize() * threadpoolsize())

 # errors inside @threads :dynamic
 function _atthreads_dynamic_with_error(a)
@@ -796,7 +799,7 @@ function _atthreads_dynamic_with_error(a)
     end
     a
 end
-@test_throws "user error in the loop body" _atthreads_dynamic_with_error(zeros(nthreads()))
+@test_throws "user error in the loop body" _atthreads_dynamic_with_error(zeros(threadpoolsize()))

 try
     @macroexpand @threads(for i = 1:10, j = 1:10; end)
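The schedule-composition rules these tests pin down, in one place (a sketch; the behavior is as asserted by the tests above): `:dynamic` nests under both `:dynamic` and `:static`, while nesting `:static` throws.

using Base.Threads
using Base.Threads: threadpoolsize

inc = Atomic{Int}(0)
@threads :dynamic for _ in 1:2
    @threads :dynamic for _ in 1:2  # OK: :dynamic may nest
        atomic_add!(inc, 1)
    end
end
@assert inc[] == 4

# By contrast, this throws "`@threads :static` cannot be used concurrently or nested":
#   @threads :dynamic for _ in 1:2
#       @threads :static for _ in 1:2; end
#   end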
@@ -1025,7 +1028,7 @@ function check_sync_end_race()
             nnotscheduled += y === :notscheduled
         end
         # Useful for tuning the test:
-        @debug "`check_sync_end_race` done" nthreads() ncompleted nnotscheduled nerror
+        @debug "`check_sync_end_race` done" threadpoolsize() ncompleted nnotscheduled nerror
     finally
         done[] = true
     end
@@ -1039,21 +1042,21 @@ end

 # issue #41546, thread-safe package loading
 @testset "package loading" begin
-    ch = Channel{Bool}(nthreads())
+    ch = Channel{Bool}(threadpoolsize())
     barrier = Base.Event()
     old_act_proj = Base.ACTIVE_PROJECT[]
     try
         pushfirst!(LOAD_PATH, "@")
         Base.ACTIVE_PROJECT[] = joinpath(@__DIR__, "TestPkg")
         @sync begin
-            for _ in 1:nthreads()
+            for _ in 1:threadpoolsize()
                 Threads.@spawn begin
                     put!(ch, true)
                     wait(barrier)
                     @eval using TestPkg
                 end
             end
-            for _ in 1:nthreads()
+            for _ in 1:threadpoolsize()
                 take!(ch)
             end
             notify(barrier)
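The testset above uses a channel-plus-event rendezvous so that every task hits `using TestPkg` at once. Generalized as a sketch; `run_concurrently` is a hypothetical helper, not part of the diff:

using Base.Threads: @spawn, threadpoolsize

function run_concurrently(f, n::Int = threadpoolsize())
    ready = Channel{Bool}(n)
    barrier = Base.Event()
    @sync begin
        for _ in 1:n
            @spawn begin
                put!(ready, true)   # announce that this task has started
                wait(barrier)       # park until every other task has too
                f()
            end
        end
        foreach(_ -> take!(ready), 1:n)  # all n tasks are now parked
        notify(barrier)                  # release them simultaneously
    end
end

run_concurrently(() -> println("running on thread ", Threads.threadid()))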