Skip to content

Commit

Permalink
Try to avoid julia becoming unkillable after fatal errors (#40056)
Browse files Browse the repository at this point in the history
- don't smash the alt-stack when already using it
- handle jl_critical_error on the original stack, leaving our signal
handling thread free to handle more signals (and helping lock corruption
detection in some cases)
- unblock signals when handling signals: some libc implementations apparently
like to block all signals, which can cause mild havoc, since we'd really like
the user or bad data to be able to still kill the process (and not just
be ignored or cause it to hang)
- reset signals to SIG_DFL earlier (so we recurse less)
- destroy some state from the Task we co-opted to run the exit handlers,
so that it can't accidentally jump back into the running program after
we've started tearing down the process, from an untimely ^C (previously
^C might cancel the exit) or a jlbacktrace call.
- mark functions as leaf with CFI instead of (potentially) smashing the
stack, and add a bit of red-zone if we are recursing (to keep pgcstack
sensible)
- support safe_restore for the mach catch_exception_raise (while we're
trying to generate the backtrace)

(cherry picked from commit 107901d)
(cherry picked from commit f02a790)
  • Loading branch information
vtjnash authored and KristofferC committed Aug 25, 2021
1 parent a8327a3 commit 7166d86
Show file tree
Hide file tree
Showing 8 changed files with 252 additions and 140 deletions.
2 changes: 1 addition & 1 deletion src/gf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1817,7 +1817,7 @@ static void JL_NORETURN jl_method_error_bare(jl_function_t *f, jl_value_t *args,
jl_static_show((JL_STREAM*)STDERR_FILENO,args); jl_printf((JL_STREAM*)STDERR_FILENO,"\n");
jl_ptls_t ptls = jl_get_ptls_states();
ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0);
jl_critical_error(0, NULL, ptls->bt_data, &ptls->bt_size);
jl_critical_error(0, NULL);
abort();
}
// not reached
Expand Down
32 changes: 31 additions & 1 deletion src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,36 @@ void __tsan_switch_to_fiber(void *fiber, unsigned flags);
# define JL_USE_IFUNC 0
#endif

// CFI_NORETURN: unwind-info marker meant to be dropped into the body of a
// function that never returns, so that stack unwinding stops at that frame.
// NOTE: every non-empty expansion below already ends in ';' and the POSIX
// variants expand to several statements, so write it without a trailing
// semicolon and never as the sole body of an unbraced if/else/loop.
// If we've smashed the stack, (and not just normal NORETURN)
// this will smash stack-unwind too
#ifdef _OS_WINDOWS_
#if defined(_CPU_X86_64_)
// install the unhandled exception handler at the top of our stack
// to call directly into our personality handler
#define CFI_NORETURN \
asm volatile ("\t.seh_handler __julia_personality, @except\n\t.text");
#else
// no marker is emitted on other Windows targets: the macro expands to nothing
#define CFI_NORETURN
#endif
#else
// wipe out the call-stack unwind capability beyond this function
// (we are noreturn, so it is not a total lie)
#if defined(_CPU_X86_64_)
// per nongnu libunwind: "x86_64 ABI specifies that end of call-chain is marked with a NULL RBP or undefined return address"
// so we do all 3, to be extra certain of it
#define CFI_NORETURN \
asm volatile ("\t.cfi_undefined rip"); \
asm volatile ("\t.cfi_undefined rbp"); \
asm volatile ("\t.cfi_return_column rbp");
#else
// per nongnu libunwind: "DWARF spec says undefined return address location means end of stack"
// we use whatever happens to be register 1 on this platform for this
#define CFI_NORETURN \
asm volatile ("\t.cfi_undefined 1"); \
asm volatile ("\t.cfi_return_column 1");
#endif
#endif

// If this is detected in a backtrace of segfault, it means the functions
// that use this value must be reworked into their async form with cb arg
// provided and with JL_UV_LOCK used around the calls
Expand Down Expand Up @@ -904,7 +934,7 @@ size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t
size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t *ctx, jl_gcframe_t *pgcstack) JL_NOTSAFEPOINT;
#endif
JL_DLLEXPORT jl_value_t *jl_get_backtrace(void);
void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size);
void jl_critical_error(int sig, bt_context_t *context);
JL_DLLEXPORT void jl_raise_debugger(void);
int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT;
JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT;
Expand Down
41 changes: 35 additions & 6 deletions src/signal-handling.c
Original file line number Diff line number Diff line change
Expand Up @@ -231,15 +231,44 @@ void jl_show_sigill(void *_ctx)
#endif
}

// what to do on a critical error
void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size)
// what to do on a critical error on a thread
void jl_critical_error(int sig, bt_context_t *context)
{
// This function is not allowed to reference any TLS variables.
// We need to explicitly pass in the TLS buffer pointer when
// we make `jl_filename` and `jl_lineno` thread local.

jl_ptls_t ptls = jl_get_ptls_states();
jl_bt_element_t *bt_data = ptls->bt_data;
size_t *bt_size = &ptls->bt_size;
size_t i, n = *bt_size;
if (sig)
if (sig) {
// kill this task, so that we cannot get back to it accidentally (via an untimely ^C or jlbacktrace in jl_exit)
ptls->pgcstack = NULL;
ptls->safe_restore = NULL;
if (ptls->current_task) {
ptls->current_task->eh = NULL;
ptls->current_task->excstack = NULL;
}
#ifndef _OS_WINDOWS_
sigset_t sset;
sigemptyset(&sset);
// n.b. In `abort()`, Apple's libSystem "helpfully" blocks all signals
// on all threads but SIGABRT. But we also don't know what the thread
// was doing, so unblock all critical signals so that they will crash
// hard, and not just get stuck.
sigaddset(&sset, SIGSEGV);
sigaddset(&sset, SIGBUS);
sigaddset(&sset, SIGILL);
// also unblock fatal signals now, so we won't get back here twice
sigaddset(&sset, SIGTERM);
sigaddset(&sset, SIGABRT);
sigaddset(&sset, SIGQUIT);
// and the original signal is now fatal too, in case it wasn't
// something already listed (?)
if (sig != SIGINT)
sigaddset(&sset, sig);
pthread_sigmask(SIG_UNBLOCK, &sset, NULL);
#endif
jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig));
}
jl_safe_printf("in expression starting at %s:%d\n", jl_filename, jl_lineno);
if (context) {
// Must avoid extended backtrace frames here unless we're sure bt_data
Expand Down
122 changes: 82 additions & 40 deletions src/signals-mach.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,16 @@ extern boolean_t exc_server(mach_msg_header_t *, mach_msg_header_t *);
// Thread entry point that serves Mach exception messages arriving on
// `segv_port`, dispatching them through `exc_server`.
// `arg` is unused. If mach_msg_server ever hands control back, its status is
// printed and the process exits with 128 + SIGSEGV, so the loop body runs at
// most once in practice.
void *mach_segv_listener(void *arg)
{
    (void)arg;
    // NOTE(review): presumably forces this thread's TLS state to exist before
    // any exception is serviced -- confirm against jl_get_ptls_states.
    (void)jl_get_ptls_states();
    for (;;) {
        int server_status = mach_msg_server(exc_server, 2048, segv_port, MACH_MSG_TIMEOUT_NONE);
        const char *reason = mach_error_string(server_status);
        jl_safe_printf("mach_msg_server: %s\n", reason);
        jl_exit(128 + SIGSEGV);
    }
}

static void allocate_segv_handler()

static void allocate_mach_handler()
{
// ensure KEYMGR_GCC3_DW2_OBJ_LIST is initialized, as this requires malloc
// and thus can deadlock when used without first initializing it.
Expand Down Expand Up @@ -122,7 +124,7 @@ static void allocate_segv_handler()
jl_error("pthread_create failed");
}
pthread_attr_destroy(&attr);
for (int16_t tid = 0;tid < jl_n_threads;tid++) {
for (int16_t tid = 0; tid < jl_n_threads; tid++) {
attach_exception_port(pthread_mach_thread_np(jl_all_tls_states[tid]->system_id), 0);
}
}
Expand Down Expand Up @@ -164,19 +166,31 @@ typedef arm_exception_state64_t host_exception_state_t;
static void jl_call_in_state(jl_ptls_t ptls2, host_thread_state_t *state,
void (*fptr)(void))
{
uint64_t rsp = (uint64_t)ptls2->signal_stack + sig_stack_size;
#ifdef _CPU_X86_64_
uintptr_t rsp = state->__rsp;
#elif defined(_CPU_AARCH64_)
uintptr_t rsp = state->__sp;
#else
#error "julia: throw-in-context not supported on this platform"
#endif
if (ptls2->signal_stack == NULL || is_addr_on_sigstack(ptls2, (void*)rsp)) {
rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment
}
else {
rsp = (uintptr_t)ptls2->signal_stack + sig_stack_size;
}
assert(rsp % 16 == 0);

// push (null) $RIP onto the stack
rsp -= sizeof(void*);
*(void**)rsp = NULL;

#ifdef _CPU_X86_64_
rsp -= sizeof(void*);
state->__rsp = rsp; // set stack pointer
state->__rip = (uint64_t)fptr; // "call" the function
#else
#elif defined(_CPU_AARCH64_)
state->__sp = rsp;
state->__pc = (uint64_t)fptr;
state->__lr = 0;
#else
#error "julia: throw-in-context not supported on this platform"
#endif
}

Expand All @@ -194,11 +208,22 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exceptio
ptls2->sig_exception = exception;
}
jl_call_in_state(ptls2, &state, &jl_sig_throw);
ret = thread_set_state(thread, THREAD_STATE,
(thread_state_t)&state, count);
ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count);
HANDLE_MACH_ERROR("thread_set_state", ret);
}

// Signal handler for SIGSEGV / SIGBUS.
// Without a pending `safe_restore` on this thread, the fault is forwarded to
// sigdie_handler. With one (restarting jl_ or jl_unwind_stepn), the interrupted
// context is rewritten so that it continues in jl_sig_throw.
static void segv_handler(int sig, siginfo_t *info, void *context)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    assert(sig == SIGSEGV || sig == SIGBUS);

    if (!ptls->safe_restore) {
        sigdie_handler(sig, info, context);
        return;
    }

    // restarting jl_ or jl_unwind_stepn
    host_thread_state_t *interrupted = (host_thread_state_t*)jl_to_bt_context(context);
    jl_call_in_state(ptls, interrupted, &jl_sig_throw);
}

//exc_server uses dlsym to find symbol
JL_DLLEXPORT
kern_return_t catch_exception_raise(mach_port_t exception_port,
Expand All @@ -208,18 +233,16 @@ kern_return_t catch_exception_raise(mach_port_t exception_port,
exception_data_t code,
mach_msg_type_number_t code_count)
{
unsigned int count = THREAD_STATE_COUNT;
unsigned int exc_count = HOST_EXCEPTION_STATE_COUNT;
host_exception_state_t exc_state;
host_thread_state_t state;
#ifdef LIBOSXUNWIND
#ifdef LLVMLIBUNWIND
if (thread == mach_profiler_thread) {
return profiler_segv_handler(exception_port, thread, task, exception, code, code_count);
}
#endif
int16_t tid;
jl_ptls_t ptls2 = NULL;
for (tid = 0;tid < jl_n_threads;tid++) {
for (tid = 0; tid < jl_n_threads; tid++) {
jl_ptls_t _ptls2 = jl_all_tls_states[tid];
if (pthread_mach_thread_np(_ptls2->system_id) == thread) {
ptls2 = _ptls2;
Expand Down Expand Up @@ -288,11 +311,8 @@ kern_return_t catch_exception_raise(mach_port_t exception_port,
return KERN_SUCCESS;
}
else {
kern_return_t ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)&state, &count);
HANDLE_MACH_ERROR("thread_get_state", ret);
jl_critical_error(SIGSEGV, (unw_context_t*)&state,
ptls2->bt_data, &ptls2->bt_size);
return KERN_INVALID_ARGUMENT;
jl_exit_thread0(128 + SIGSEGV, NULL, 0);
return KERN_SUCCESS;
}
}

Expand All @@ -307,24 +327,27 @@ static void attach_exception_port(thread_port_t thread, int segv_only)
HANDLE_MACH_ERROR("thread_set_exception_ports", ret);
}

static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx)
{
jl_ptls_t ptls2 = jl_all_tls_states[tid];
mach_port_t tid_port = pthread_mach_thread_np(ptls2->system_id);
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);

kern_return_t ret = thread_suspend(tid_port);
kern_return_t ret = thread_suspend(thread);
HANDLE_MACH_ERROR("thread_suspend", ret);

// Do the actual sampling
unsigned int count = THREAD_STATE_COUNT;
static unw_context_t state;
memset(&state, 0, sizeof(unw_context_t));
memset(ctx, 0, sizeof(*ctx));

// Get the state of the suspended thread
ret = thread_get_state(tid_port, THREAD_STATE, (thread_state_t)&state, &count);
ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)ctx, &count);
}

// Initialize the unwind context with the suspend thread's state
*ctx = &state;
// Suspend thread `tid` and expose its captured register state through `ctx`
// as an unwind context (the work is done by jl_thread_suspend_and_get_state2).
// The state is kept in a function-local static buffer: the pointer written to
// `*ctx` refers to that buffer and is overwritten by the next call.
static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
{
    static host_thread_state_t suspended_state;

    jl_thread_suspend_and_get_state2(tid, &suspended_state);
    *ctx = (unw_context_t*)&suspended_state;
}

static void jl_thread_resume(int tid, int sig)
Expand Down Expand Up @@ -366,29 +389,46 @@ static void jl_try_deliver_sigint(void)
HANDLE_MACH_ERROR("thread_resume", ret);
}

static void jl_exit_thread0(int exitstate)
// Exit callback that jl_exit_thread0 installs as the function thread 0 is
// redirected into: report the critical error, then exit with `exitstate`.
// Never returns.
static void JL_NORETURN jl_exit_thread0_cb(int exitstate)
{
// mark this frame as the end of the unwindable call chain
CFI_NORETURN
// assumes exitstate was built as 128 + <signal number> (as in
// jl_exit_thread0(128 + SIGSEGV, ...)), so this recovers the signal to
// report -- TODO confirm for other callers. No context is passed.
jl_critical_error(exitstate - 128, NULL);
jl_exit(exitstate);
}

static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size)
{
jl_ptls_t ptls2 = jl_all_tls_states[0];
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);
kern_return_t ret = thread_suspend(thread);
HANDLE_MACH_ERROR("thread_suspend", ret);

host_thread_state_t state;
jl_thread_suspend_and_get_state2(0, &state);
unw_context_t *uc = (unw_context_t*)&state;

// This aborts `sleep` and other syscalls.
ret = thread_abort(thread);
kern_return_t ret = thread_abort(thread);
HANDLE_MACH_ERROR("thread_abort", ret);

unsigned int count = THREAD_STATE_COUNT;
host_thread_state_t state;
ret = thread_get_state(thread, THREAD_STATE,
(thread_state_t)&state, &count);
if (bt_data == NULL) {
// Must avoid extended backtrace frames here unless we're sure bt_data
// is properly rooted.
ptls2->bt_size = rec_backtrace_ctx(ptls2->bt_data, JL_MAX_BT_SIZE, uc, NULL);
}
else {
ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE
memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0]));
}

void (*exit_func)(int) = &_exit;
if (thread0_exit_count <= 1) {
exit_func = &jl_exit;
exit_func = &jl_exit_thread0_cb;
}
else if (thread0_exit_count == 2) {
exit_func = &exit;
}
else {
exit_func = &_exit;
}

#ifdef _CPU_X86_64_
// First integer argument. Not portable but good enough =)
Expand All @@ -399,8 +439,8 @@ static void jl_exit_thread0(int exitstate)
#error Fill in first integer argument here
#endif
jl_call_in_state(ptls2, &state, (void (*)(void))exit_func);
ret = thread_set_state(thread, THREAD_STATE,
(thread_state_t)&state, count);
unsigned int count = THREAD_STATE_COUNT;
ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count);
HANDLE_MACH_ERROR("thread_set_state", ret);

ret = thread_resume(thread);
Expand Down Expand Up @@ -498,8 +538,10 @@ void *mach_profile_listener(void *arg)
break;
}

unw_context_t *uc;
jl_thread_suspend_and_get_state(i, &uc);
host_thread_state_t state;
jl_thread_suspend_and_get_state2(i, &state);
unw_context_t *uc = (unw_context_t*)&state;

if (running) {
#ifdef LIBOSXUNWIND
/*
Expand Down
Loading

0 comments on commit 7166d86

Please sign in to comment.