Skip to content

Commit

Permalink
fix deadlock misconfiguration
Browse files Browse the repository at this point in the history
There is not permitted to be either a safepoint or equivalently a
safepoint region transition while a lock is being held that might be
needed by GC as that forms a cycle. We must ensure that this thread is
permitted to keep running, so that it becomes possible for it to reach
the condition wait call and release the lock to keep in synchronized
with the notify_all in the sweep. Alternatively we could use a
timed_wait and continuous poll the GC (to ensure we actually run GC
periodically), but I would rather that be an internal GC policy than an
external forcing factor here. This prevents the GC from recruiting this
thread (via a signal) to help GC even though it is observably sleeping,
but we might consider later integrating a way for the GC to notify a set
of condition variables upon starting to wake them and recruit them
temporarily via a subsequent safepoint such as (in julia-syntax psuedocode):

    gc_safe_enter()
    lock(condvar) do
        while !condmet
            wait(condvar)
            while gc_running
                unlock(lock)
                unsafe_enter()
                gc_safepoint()
                unsafe_leave()
                lock(lock)
            end
        end
    end
    gc_safe_leave()
  • Loading branch information
vtjnash committed Jun 19, 2024
1 parent 847319f commit 940c2c3
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 10 deletions.
11 changes: 5 additions & 6 deletions src/engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner
ct->ptls->engine_nqueued++; // disables finalizers until inference is finished on this method graph
jl_code_instance_t *ci = jl_new_codeinst_uninit(m, owner); // allocate a placeholder
JL_GC_PUSH1(&ci);
int8_t gc_state = jl_gc_safe_enter(ct->ptls);
InferKey key = {m, owner};
std::unique_lock<std::mutex> lock(engine_lock);
auto tid = jl_atomic_load_relaxed(&ct->tid);
Expand All @@ -72,6 +73,8 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner
auto record = Reservations.find(key);
if (record == Reservations.end()) {
Reservations[key] = ReservationInfo{tid, ci};
lock.unlock();
jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint
JL_GC_POP();
return ci;
}
Expand All @@ -81,6 +84,8 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner
auto wait_tid = record->second.tid;
while (1) {
if (wait_tid == tid) {
lock.unlock();
jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint
JL_GC_POP();
ct->ptls->engine_nqueued--;
return ci; // break the cycle
Expand All @@ -96,11 +101,9 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner
assert(wait_tid != record2->second.tid);
wait_tid = record2->second.tid;
}
int8_t gc_state = jl_gc_safe_enter(ct->ptls);
Awaiting[tid] = key;
engine_wait.wait(lock);
Awaiting[tid] = InferKey{};
jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint
}
}

Expand Down Expand Up @@ -136,10 +139,6 @@ void jl_engine_sweep(jl_ptls_t *gc_all_tls_states)
engine_wait.notify_all();
}

void jl_engine_inhibit_finalizers(void)
{
}

void jl_engine_fulfill(jl_code_instance_t *ci, jl_code_info_t *src)
{
jl_task_t *ct = jl_current_task;
Expand Down
10 changes: 6 additions & 4 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,10 +352,12 @@ STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
return jl_gc_state_set(ptls, state, jl_atomic_load_relaxed(&ptls->gc_state));
}
#ifdef __clang_gcanalyzer__
int8_t jl_gc_unsafe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE; // this could be a safepoint, but we will assume it is not
void jl_gc_unsafe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
int8_t jl_gc_safe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
void jl_gc_safe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT_LEAVE; // this might not be a safepoint, but we have to assume it could be (statically)
// these might not be a safepoint (if they are no-op safe=>safe transitions), but we have to assume it could be (statically)
// however mark a delineated region in which safepoints would be not permissible
int8_t jl_gc_unsafe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT_LEAVE;
void jl_gc_unsafe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT_ENTER;
int8_t jl_gc_safe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT_ENTER;
void jl_gc_safe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT_LEAVE;
#else
#define jl_gc_unsafe_enter(ptls) jl_gc_state_save_and_set(ptls, JL_GC_STATE_UNSAFE)
#define jl_gc_unsafe_leave(ptls, state) ((void)jl_gc_state_set(ptls, (state), JL_GC_STATE_UNSAFE))
Expand Down

0 comments on commit 940c2c3

Please sign in to comment.