Skip to content

Commit

Permalink
Try to avoid julia becoming unkillable after fatal errors
Browse files Browse the repository at this point in the history
- don't smash the alt-stack when already using it
- handle jl_critical_error on the original stack, leaving our signal
handling thread free to handle more signals (and helping lock corruption
detection in some cases)
- unblock signals when handling signals: some libcs apparently like to
block all signals, which can cause mild havoc, since we'd really like
the user or bad data to still be able to kill the process (and not just
be ignored or cause it to hang)
- reset signals to SIG_DFL earlier (so we recurse less)
- destroy some state from the Task we co-opted to run the exit handlers,
so that it can't accidentally jump back into the running program after
we've started tearing down the process, from an untimely ^C (previously
^C might cancel the exit) or a jlbacktrace call.
  • Loading branch information
vtjnash committed Mar 16, 2021
1 parent b4c79e7 commit e8c49fa
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 72 deletions.
2 changes: 1 addition & 1 deletion src/gf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1825,7 +1825,7 @@ static void JL_NORETURN jl_method_error_bare(jl_function_t *f, jl_value_t *args,
jl_static_show((JL_STREAM*)STDERR_FILENO,args); jl_printf((JL_STREAM*)STDERR_FILENO,"\n");
jl_ptls_t ptls = jl_get_ptls_states();
ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0);
jl_critical_error(0, NULL, ptls->bt_data, &ptls->bt_size);
jl_critical_error(0, NULL);
abort();
}
// not reached
Expand Down
2 changes: 1 addition & 1 deletion src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -913,7 +913,7 @@ size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t
size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t *ctx, jl_gcframe_t *pgcstack) JL_NOTSAFEPOINT;
#endif
JL_DLLEXPORT jl_value_t *jl_get_backtrace(void);
void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size);
void jl_critical_error(int sig, bt_context_t *context);
JL_DLLEXPORT void jl_raise_debugger(void);
int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT;
JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT;
Expand Down
41 changes: 35 additions & 6 deletions src/signal-handling.c
Original file line number Diff line number Diff line change
Expand Up @@ -231,15 +231,44 @@ void jl_show_sigill(void *_ctx)
#endif
}

// what to do on a critical error
void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size)
// what to do on a critical error on a thread
void jl_critical_error(int sig, bt_context_t *context)
{
// This function is not allowed to reference any TLS variables.
// We need to explicitly pass in the TLS buffer pointer when
// we make `jl_filename` and `jl_lineno` thread local.

jl_ptls_t ptls = jl_get_ptls_states();
jl_bt_element_t *bt_data = ptls->bt_data;
size_t *bt_size = &ptls->bt_size;
size_t i, n = *bt_size;
if (sig)
if (sig) {
// kill this task, so that we cannot get back to it accidentally (via an untimely ^C or jlbacktrace in jl_exit)
ptls->pgcstack = NULL;
ptls->safe_restore = NULL;
if (ptls->current_task) {
ptls->current_task->eh = NULL;
ptls->current_task->excstack = NULL;
}
#ifndef _OS_WINDOWS_
sigset_t sset;
sigemptyset(&sset);
// n.b. In `abort()`, Apple's libSystem "helpfully" blocks all signals
// on all threads but SIGABRT. But we also don't know what the thread
// was doing, so unblock all critical signals so that they will crash
// hard, and not just get stuck.
sigaddset(&sset, SIGSEGV);
sigaddset(&sset, SIGBUS);
sigaddset(&sset, SIGILL);
// also unblock fatal signals now, so we won't get back here twice
sigaddset(&sset, SIGTERM);
sigaddset(&sset, SIGABRT);
sigaddset(&sset, SIGQUIT);
// and the original signal is now fatal too, in case it wasn't
// something already listed (?)
if (sig != SIGINT)
sigaddset(&sset, sig);
pthread_sigmask(SIG_UNBLOCK, &sset, NULL);
#endif
jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig));
}
jl_safe_printf("in expression starting at %s:%d\n", jl_filename, jl_lineno);
if (context) {
// Must avoid extended backtrace frames here unless we're sure bt_data
Expand Down
77 changes: 46 additions & 31 deletions src/signals-mach.c
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,7 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exceptio
ptls2->sig_exception = exception;
}
jl_call_in_state(ptls2, &state, &jl_sig_throw);
ret = thread_set_state(thread, THREAD_STATE,
(thread_state_t)&state, count);
ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count);
HANDLE_MACH_ERROR("thread_set_state", ret);
}

Expand All @@ -218,18 +217,16 @@ kern_return_t catch_exception_raise(mach_port_t exception_port,
exception_data_t code,
mach_msg_type_number_t code_count)
{
unsigned int count = THREAD_STATE_COUNT;
unsigned int exc_count = HOST_EXCEPTION_STATE_COUNT;
host_exception_state_t exc_state;
host_thread_state_t state;
#ifdef LLVMLIBUNWIND
if (thread == mach_profiler_thread) {
return profiler_segv_handler(exception_port, thread, task, exception, code, code_count);
}
#endif
int16_t tid;
jl_ptls_t ptls2 = NULL;
for (tid = 0;tid < jl_n_threads;tid++) {
for (tid = 0; tid < jl_n_threads; tid++) {
jl_ptls_t _ptls2 = jl_all_tls_states[tid];
if (pthread_mach_thread_np(_ptls2->system_id) == thread) {
ptls2 = _ptls2;
Expand Down Expand Up @@ -298,11 +295,8 @@ kern_return_t catch_exception_raise(mach_port_t exception_port,
return KERN_SUCCESS;
}
else {
kern_return_t ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)&state, &count);
HANDLE_MACH_ERROR("thread_get_state", ret);
jl_critical_error(SIGSEGV, (unw_context_t*)&state,
ptls2->bt_data, &ptls2->bt_size);
return KERN_INVALID_ARGUMENT;
jl_exit_thread0(128 + SIGSEGV, NULL, 0);
return KERN_SUCCESS;
}
}

Expand All @@ -317,24 +311,27 @@ static void attach_exception_port(thread_port_t thread, int segv_only)
HANDLE_MACH_ERROR("thread_set_exception_ports", ret);
}

// Suspend thread `tid` (an index into jl_all_tls_states) and read its
// register state with thread_get_state.
// NOTE(review): this span is a diff view; each old line appears to be followed
// directly by its replacement (the two signatures, `tid_port` vs `thread`, the
// function-local static `state` vs the caller-supplied `ctx` buffer).
static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx)
{
jl_ptls_t ptls2 = jl_all_tls_states[tid];
// Translate the pthread handle into the Mach port used by the thread_* calls
mach_port_t tid_port = pthread_mach_thread_np(ptls2->system_id);
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);

kern_return_t ret = thread_suspend(tid_port);
kern_return_t ret = thread_suspend(thread);
HANDLE_MACH_ERROR("thread_suspend", ret);

// Do the actual sampling
unsigned int count = THREAD_STATE_COUNT;
static unw_context_t state;
// Zero the destination buffer before asking the kernel to fill it
memset(&state, 0, sizeof(unw_context_t));
memset(ctx, 0, sizeof(*ctx));

// Get the state of the suspended thread
ret = thread_get_state(tid_port, THREAD_STATE, (thread_state_t)&state, &count);
ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)ctx, &count);
// NOTE(review): unlike thread_suspend above, `ret` from thread_get_state is
// never passed to HANDLE_MACH_ERROR, so a failure here goes unreported and the
// caller presumably continues with the zero-filled buffer from the memset.
}

// Initialize the unwind context with the suspend thread's state
*ctx = &state;
// Suspend thread `tid` and capture its register state (both done by
// jl_thread_suspend_and_get_state2), then hand the result back through `*ctx`
// reinterpreted as an unwind context.
// The captured state lives in function-local static storage, so the pointer
// stored in `*ctx` refers to a buffer that the next call overwrites.
static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
{
    static host_thread_state_t suspended_state;
    host_thread_state_t *captured = &suspended_state;
    jl_thread_suspend_and_get_state2(tid, captured);
    *ctx = (unw_context_t*)captured;
}

static void jl_thread_resume(int tid, int sig)
Expand Down Expand Up @@ -376,29 +373,45 @@ static void jl_try_deliver_sigint(void)
HANDLE_MACH_ERROR("thread_resume", ret);
}

static void jl_exit_thread0(int exitstate)
// Exit routine that jl_exit_thread0 arranges to run on thread 0: report the
// fatal signal through jl_critical_error, then leave via jl_exit with the
// same exit state.
static void jl_exit_thread0_cb(int exitstate)
{
    // The callers visible in this file build the exit state as 128 + signal
    // number, so subtracting 128 recovers the signal to report.
    int sig = exitstate - 128;
    jl_critical_error(sig, NULL);
    jl_exit(exitstate);
}

static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size)
{
jl_ptls_t ptls2 = jl_all_tls_states[0];
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);
kern_return_t ret = thread_suspend(thread);
HANDLE_MACH_ERROR("thread_suspend", ret);

host_thread_state_t state;
jl_thread_suspend_and_get_state2(0, &state);
unw_context_t *uc = (unw_context_t*)&state;

// This aborts `sleep` and other syscalls.
ret = thread_abort(thread);
kern_return_t ret = thread_abort(thread);
HANDLE_MACH_ERROR("thread_abort", ret);

unsigned int count = THREAD_STATE_COUNT;
host_thread_state_t state;
ret = thread_get_state(thread, THREAD_STATE,
(thread_state_t)&state, &count);
if (bt_data == NULL) {
// Must avoid extended backtrace frames here unless we're sure bt_data
// is properly rooted.
ptls2->bt_size = rec_backtrace_ctx(ptls2->bt_data, JL_MAX_BT_SIZE, uc, NULL);
}
else {
ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE
memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0]));
}

void (*exit_func)(int) = &_exit;
if (thread0_exit_count <= 1) {
exit_func = &jl_exit;
exit_func = &jl_exit_thread0_cb;
}
else if (thread0_exit_count == 2) {
exit_func = &exit;
}
else {
exit_func = &_exit;
}

#ifdef _CPU_X86_64_
// First integer argument. Not portable but good enough =)
Expand All @@ -409,8 +422,8 @@ static void jl_exit_thread0(int exitstate)
#error Fill in first integer argument here
#endif
jl_call_in_state(ptls2, &state, (void (*)(void))exit_func);
ret = thread_set_state(thread, THREAD_STATE,
(thread_state_t)&state, count);
unsigned int count = THREAD_STATE_COUNT;
ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count);
HANDLE_MACH_ERROR("thread_set_state", ret);

ret = thread_resume(thread);
Expand Down Expand Up @@ -508,8 +521,10 @@ void *mach_profile_listener(void *arg)
break;
}

unw_context_t *uc;
jl_thread_suspend_and_get_state(i, &uc);
host_thread_state_t state;
jl_thread_suspend_and_get_state2(i, &state);
unw_context_t *uc = (unw_context_t*)&state;

if (running) {
#ifdef LLVMLIBUNWIND
/*
Expand Down
73 changes: 45 additions & 28 deletions src/signals-unix.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ static bt_context_t *jl_to_bt_context(void *sigctx)
}

static int thread0_exit_count = 0;
static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size);

static inline __attribute__((unused)) uintptr_t jl_get_rsp_from_ctx(const void *_ctx)
{
Expand Down Expand Up @@ -87,6 +88,13 @@ static inline __attribute__((unused)) uintptr_t jl_get_rsp_from_ctx(const void *
#endif
}

// Report whether `ptr` lies within the signal stack owned by `ptls`.
// The accepted range starts one page (jl_page_size, the guard page) before
// `ptls->signal_stack` and ends at `ptls->signal_stack + sig_stack_size`;
// both ends are inclusive. Returns 1 when inside the range, 0 otherwise.
static int is_addr_on_sigstack(jl_ptls_t ptls, void *ptr)
{
    char *addr = (char*)ptr;
    char *lowest = (char*)ptls->signal_stack - jl_page_size; // includes the guard page
    char *highest = (char*)ptls->signal_stack + sig_stack_size;
    return addr >= lowest && addr <= highest;
}

// Modify signal context `_ctx` so that `fptr` will execute when the signal
// returns. `fptr` will execute on the signal stack, and must not return.
static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int sig, void *_ctx)
Expand All @@ -106,7 +114,13 @@ static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int sig, void *_c
fptr();
return;
}
uintptr_t rsp = (uintptr_t)ptls->signal_stack + sig_stack_size;
uintptr_t rsp = jl_get_rsp_from_ctx(_ctx);
if (is_addr_on_sigstack(ptls, (void*)rsp)) {
rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment
}
else {
rsp = (uintptr_t)ptls->signal_stack + sig_stack_size;
}
assert(rsp % 16 == 0);
#if defined(_OS_LINUX_) && defined(_CPU_X86_64_)
ucontext_t *ctx = (ucontext_t*)_ctx;
Expand Down Expand Up @@ -208,16 +222,11 @@ static int is_addr_on_stack(jl_ptls_t ptls, void *addr)

static void sigdie_handler(int sig, siginfo_t *info, void *context)
{
jl_ptls_t ptls = jl_get_ptls_states();
sigset_t sset;
signal(sig, SIG_DFL);
uv_tty_reset_mode();
if (sig == SIGILL)
jl_show_sigill(context);
jl_critical_error(sig, jl_to_bt_context(context),
ptls->bt_data, &ptls->bt_size);
sigfillset(&sset);
sigprocmask(SIG_UNBLOCK, &sset, NULL);
signal(sig, SIG_DFL);
jl_critical_error(sig, jl_to_bt_context(context));
if (sig != SIGSEGV &&
sig != SIGBUS &&
sig != SIGILL) {
Expand Down Expand Up @@ -290,13 +299,6 @@ int is_write_fault(void *context) {
}
#endif

// Report whether `ptr` lies within the signal stack owned by `ptls`.
// The accepted range starts one page (jl_page_size, the guard page) before
// `ptls->signal_stack` and ends at `ptls->signal_stack + sig_stack_size`;
// both ends are inclusive. Returns 1 when inside the range, 0 otherwise.
static int is_addr_on_sigstack(jl_ptls_t ptls, void *ptr)
{
    char *addr = (char*)ptr;
    char *lowest = (char*)ptls->signal_stack - jl_page_size; // includes the guard page
    char *highest = (char*)ptls->signal_stack + sig_stack_size;
    return addr >= lowest && addr <= highest;
}

static int jl_is_on_sigstack(jl_ptls_t ptls, void *ptr, void *context)
{
return (is_addr_on_sigstack(ptls, ptr) &&
Expand Down Expand Up @@ -381,9 +383,8 @@ static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)

static void jl_thread_resume(int tid, int sig)
{
(void)sig;
jl_ptls_t ptls2 = jl_all_tls_states[tid];
jl_atomic_store_release(&ptls2->signal_request, 1);
jl_atomic_store_release(&ptls2->signal_request, sig == -1 ? 3 : 1);
pthread_cond_broadcast(&exit_signal_cond);
pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge
assert(jl_atomic_load_acquire(&ptls2->signal_request) == 0);
Expand Down Expand Up @@ -412,6 +413,7 @@ static void jl_exit_thread0_cb(void)
// (unavoidable due to its async nature).
// Try harder to exit each time if we get multiple exit requests.
if (thread0_exit_count <= 1) {
jl_critical_error(thread0_exit_state - 128, NULL);
jl_exit(thread0_exit_state);
}
else if (thread0_exit_count == 2) {
Expand All @@ -422,12 +424,23 @@ static void jl_exit_thread0_cb(void)
}
}

// Ask thread 0 (jl_all_tls_states[0]) to run the exit path with exit code
// `state`. `bt_data`/`bt_size` carry a backtrace that is copied into thread
// 0's bt buffer on the first branch below.
// NOTE(review): this span is a diff view; the one-argument signature and the
// three statements right after `ptls2` appear to be the pre-change body, and
// the if/else that follows is its replacement.
static void jl_exit_thread0(int state)
static void jl_exit_thread0(int state, jl_bt_element_t *bt_data, size_t bt_size)
{
jl_ptls_t ptls2 = jl_all_tls_states[0];
thread0_exit_state = state;
jl_atomic_store_release(&ptls2->signal_request, 3);
pthread_kill(ptls2->system_id, SIGUSR2);
// While at most one exit request has been counted: pause thread 0, publish
// the exit state and backtrace while it is paused, then resume it.
if (thread0_exit_count <= 1) {
unw_context_t *signal_context;
jl_thread_suspend_and_get_state(0, &signal_context);
thread0_exit_state = state;
ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE
// NOTE(review): bt_data is read unconditionally here, so this path assumes
// the caller passed a non-NULL buffer — confirm for every caller.
memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0]));
// sig == -1 makes jl_thread_resume store signal_request 3 instead of 1
jl_thread_resume(0, -1);
}
// Otherwise skip the suspend/copy step: post request 3 directly and signal
// the thread.
else {
thread0_exit_state = state;
jl_atomic_store_release(&ptls2->signal_request, 3);
// This also makes sure `sleep` is aborted.
pthread_kill(ptls2->system_id, SIGUSR2);
}
}

// request:
Expand All @@ -449,12 +462,10 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
pthread_cond_broadcast(&signal_caught_cond);
pthread_cond_wait(&exit_signal_cond, &in_signal_lock);
request = jl_atomic_exchange(&ptls->signal_request, 0);
assert(request == 1);
(void)request;
assert(request == 1 || request == 3);
pthread_cond_broadcast(&signal_caught_cond);
pthread_mutex_unlock(&in_signal_lock);
}
else
#endif
if (request == 2) {
int force = jl_check_force_sigint();
Expand Down Expand Up @@ -799,10 +810,16 @@ static void *signal_listener(void *arg)
// this part is async with the running of the rest of the program
// and must be thread-safe, but not necessarily signal-handler safe
if (critical) {
jl_critical_error(sig, NULL, bt_data, &bt_size);
if (doexit) {
thread0_exit_count++;
jl_exit_thread0(128 + sig);
jl_exit_thread0(128 + sig, bt_data, bt_size);
}
else {
jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig));
size_t i;
for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) {
jl_print_bt_entry_codeloc(bt_data + i);
}
}
}
}
Expand Down Expand Up @@ -874,7 +891,7 @@ void jl_install_default_signal_handlers(void)
memset(&act_die, 0, sizeof(struct sigaction));
sigemptyset(&act_die.sa_mask);
act_die.sa_sigaction = sigdie_handler;
act_die.sa_flags = SA_SIGINFO;
act_die.sa_flags = SA_SIGINFO | SA_RESETHAND;
if (sigaction(SIGILL, &act_die, NULL) < 0) {
jl_errorf("fatal error: sigaction: %s", strerror(errno));
}
Expand All @@ -885,7 +902,7 @@ void jl_install_default_signal_handlers(void)
jl_errorf("fatal error: sigaction: %s", strerror(errno));
}
// need to ensure the following signals are not SIG_IGN, even though they will be blocked
act_die.sa_flags = SA_SIGINFO | SA_RESTART;
act_die.sa_flags = SA_SIGINFO | SA_RESTART | SA_RESETHAND;
#if defined(HAVE_ITIMER)
if (sigaction(SIGPROF, &act_die, NULL) < 0) {
jl_errorf("fatal error: sigaction: %s", strerror(errno));
Expand Down
Loading

0 comments on commit e8c49fa

Please sign in to comment.