Skip to content

Commit

Permalink
i#2601: fix thread exit race on attach (#2605)
Browse files Browse the repository at this point in the history
To handle a thread exiting on attach, adds a timeout to wait_event() and
its corresponding implementations: os_wait_event() on Windows and
ksynch_wait() on UNIX.  Uses the timeout to check whether a thread exited,
and if so, to move on.

Augments the test from #2600 to test this race.

Abandon the api.detach_spawn test on Windows:i#2611 covers fixing the
tricky problems on Windows.  Leaves in place some fixes toward #2611:
+ Fixes a race where we put interception_code hooks in place before marking
  them +x
+ Increases MAX_THREADS_WAITING_FOR_DR_INIT

Fixes clang 32-bit missing __moddi3 by adding it to third_party/libgcc and linking
that into x86 and arm.

To enable adding race checks, moves doing_detach inside the synchall and adds
started_detach for the few checks that need pre-synch querying.

Removes the dynamo_thread_init_during_process_exit flag that was added in
45dd931 for #2075, as the UNIX uninit_thread_count solution from #2600 solves
that problem on its own.

Fixes #2601
  • Loading branch information
derekbruening authored and fhahn committed Dec 4, 2017
1 parent b100d97 commit 94f565e
Show file tree
Hide file tree
Showing 23 changed files with 188 additions and 97 deletions.
8 changes: 7 additions & 1 deletion core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,15 @@ if (ARM)
add_libgcc_routines(divsi3)
add_libgcc_routines(modsi3)
include_directories(../third_party/libgcc/arm)
set(CORE_SRCS ${CORE_SRCS} ../third_party/libgcc/udivmoddi4.c)
endif (ARM)

if (UNIX)
# These are needed for ARM per i#1566 above, but are also needed in some cases
# for x86, such as 32-bit manipulation of 64-bit integers under clang where
# __moddi3 is needed.
set(CORE_SRCS ${CORE_SRCS} ../third_party/libgcc/udivmoddi4.c)
endif ()

if (WIN32 AND NOT X64)
# PR 219380: to avoid __ftol2_sse from libc
# FIXME: There is no supported way to suppress a "command line" warning
Expand Down
10 changes: 5 additions & 5 deletions core/arch/sideline.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2011-2015 Google, Inc. All rights reserved.
* Copyright (c) 2011-2017 Google, Inc. All rights reserved.
* Copyright (c) 2002-2009 VMware, Inc. All rights reserved.
* **********************************************************/

Expand Down Expand Up @@ -307,7 +307,7 @@ sideline_exit()
child_sleep = false;
signal_event(wake_event);
}
wait_for_event(exited_event);
wait_for_event(exited_event, 0);

#ifdef WINDOWS
/* wait for child to die */
Expand Down Expand Up @@ -423,7 +423,7 @@ sideline_stop()
child_sleep = true;
/* signal paused_for_sideline_event to prevent wait-forever */
signal_event(paused_for_sideline_event);
wait_for_event(asleep_event);
wait_for_event(asleep_event, 0);
}
}

Expand All @@ -444,7 +444,7 @@ sideline_run(void *arg)
if (child_sleep) {
LOG(logfile, LOG_SIDELINE, VERB_3, "SIDELINE: sideline thread going to sleep\n");
signal_event(asleep_event);
wait_for_event(wake_event);
wait_for_event(wake_event, 0);
continue;
}

Expand Down Expand Up @@ -605,7 +605,7 @@ sideline_optimize(fragment_t *f,
*/
fragment_now_optimizing = f;
mutex_unlock(&do_not_delete_lock);
wait_for_event(paused_for_sideline_event);
wait_for_event(paused_for_sideline_event, 0);
mutex_lock(&do_not_delete_lock);
/* if f was deleted, exit now */
if (fragment_now_optimizing == NULL)
Expand Down
2 changes: 1 addition & 1 deletion core/dispatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -1141,7 +1141,7 @@ dispatch_exit_fcache(dcontext_t *dcontext)
"Thread %d waiting for sideline thread\n", tid);
signal_event(paused_for_sideline_event);
STATS_INC(num_wait_sideline);
wait_for_event(resume_from_sideline_event);
wait_for_event(resume_from_sideline_event, 0);
mutex_unlock(&sideline_lock);
LOG(THREAD, LOG_DISPATCH|LOG_THREADS|LOG_SIDELINE, 2,
"Thread %d resuming after sideline thread\n", tid);
Expand Down
16 changes: 8 additions & 8 deletions core/dynamo.c
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ dynamorio_app_init(void)
if (INTERNAL_OPTION(unsafe_hang_process)) {
event_t never_signaled = create_event();
SYSLOG_INTERNAL_ERROR("Hanging the process deliberately!");
wait_for_event(never_signaled);
wait_for_event(never_signaled, 0);
destroy_event(never_signaled);
}

Expand Down Expand Up @@ -2154,9 +2154,6 @@ remove_thread(IF_WINDOWS_(HANDLE hthread) thread_id_t tid)
/* this bool is protected by reset_pending_lock */
DECLARE_FREQPROT_VAR(static bool reset_at_nth_thread_triggered, false);

#ifdef DEBUG
bool dynamo_thread_init_during_process_exit = false;
#endif
/* thread-specific initialization
* if dstack_in is NULL, then a dstack is allocated; else dstack_in is used
* as the thread's dstack
Expand Down Expand Up @@ -2201,15 +2198,18 @@ dynamo_thread_init(byte *dstack_in, priv_mcontext_t *mc
/* synch point so thread creation can be prevented for critical periods */
mutex_lock(&thread_initexit_lock);

/* XXX i#2611: during detach, there is a race where a thread can
* reach here on Windows despite init_apc_go_native (i#2600).
*/
ASSERT_BUG_NUM(2611, !doing_detach);

/* The assumption is that if dynamo_exited, then we are about to exit and
* clean up, initializing this thread then would be dangerous, better to
* wait here for the app to die. Is safe with detach, since a thread
* should never reach here when dynamo_exited is true during detach */
* wait here for the app to die.
*/
/* under current implementation of process exit, can happen only under
* debug build, or app_start app_exit interface */
while (dynamo_exited) {
/* FIXME i#2075: free the dstack. */
DODEBUG({dynamo_thread_init_during_process_exit = true; });
/* logging should be safe, though might not actually result in log
* message */
DODEBUG_ONCE(LOG(GLOBAL, LOG_THREADS, 1,
Expand Down
6 changes: 3 additions & 3 deletions core/fragment.c
Original file line number Diff line number Diff line change
Expand Up @@ -5632,7 +5632,7 @@ wait_for_flusher_nolinking(dcontext_t *dcontext)
dcontext->owning_thread, flusher->owning_thread, flushtime_global);
mutex_unlock(&pt->linking_lock);
STATS_INC(num_wait_flush);
wait_for_event(pt->finished_all_unlink);
wait_for_event(pt->finished_all_unlink, 0);
LOG(THREAD, LOG_DISPATCH|LOG_THREADS, 2,
"Thread "TIDFMT" resuming after flush\n", dcontext->owning_thread);
mutex_lock(&pt->linking_lock);
Expand All @@ -5655,7 +5655,7 @@ wait_for_flusher_linking(dcontext_t *dcontext)
mutex_unlock(&pt->linking_lock);
signal_event(pt->waiting_for_unlink);
STATS_INC(num_wait_flush);
wait_for_event(pt->finished_with_unlink);
wait_for_event(pt->finished_with_unlink, 0);
LOG(THREAD, LOG_DISPATCH|LOG_THREADS, 2,
"Thread "TIDFMT" resuming after flush\n", dcontext->owning_thread);
mutex_lock(&pt->linking_lock);
Expand Down Expand Up @@ -6441,7 +6441,7 @@ flush_fragments_synch_priv(dcontext_t *dcontext, app_pc base, size_t size,
"\twaiting for thread "TIDFMT"\n", tgt_dcontext->owning_thread);
tgt_pt->wait_for_unlink = true;
mutex_unlock(&tgt_pt->linking_lock);
wait_for_event(tgt_pt->waiting_for_unlink);
wait_for_event(tgt_pt->waiting_for_unlink, 0);
mutex_lock(&tgt_pt->linking_lock);
tgt_pt->wait_for_unlink = false;
LOG(THREAD, LOG_FRAGMENT, 2,
Expand Down
2 changes: 0 additions & 2 deletions core/globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -450,8 +450,6 @@ extern bool dynamo_exited_all_other_threads; /* has dynamo exited and synched?
extern bool dynamo_exited_and_cleaned; /* has dynamo component cleanup started? */
#ifdef DEBUG
extern bool dynamo_exited_log_and_stats; /* are stats and logfile shut down? */
/* process exit in middle of any thread init? */
extern bool dynamo_thread_init_during_process_exit;
#endif
extern bool dynamo_resetting; /* in middle of global reset? */
extern bool dynamo_all_threads_synched; /* are all other threads suspended safely? */
Expand Down
4 changes: 0 additions & 4 deletions core/heap.c
Original file line number Diff line number Diff line change
Expand Up @@ -1476,10 +1476,6 @@ vmm_heap_exit()
detach
*/
ASSERT(IF_WINDOWS(doing_detach || ) /* not deterministic when detaching */
/* FIXME i#2075: if a thread is being initialized during the
* process exit, the thread will not free its dstack.
*/
IF_DEBUG(dynamo_thread_init_during_process_exit || )
heapmgt->vmheap.num_free_blocks == heapmgt->vmheap.num_blocks
- unfreed_blocks ||
/* >=, not ==, b/c if we hit the vmm limit the cur dstack
Expand Down
2 changes: 1 addition & 1 deletion core/lib/instrument.c
Original file line number Diff line number Diff line change
Expand Up @@ -3627,7 +3627,7 @@ dr_event_wait(void *event)
dcontext_t *dcontext = get_thread_private_dcontext();
if (IS_CLIENT_THREAD(dcontext))
dcontext->client_data->client_thread_safe_for_synch = true;
wait_for_event((event_t)event);
wait_for_event((event_t)event, 0);
if (IS_CLIENT_THREAD(dcontext))
dcontext->client_data->client_thread_safe_for_synch = false;
return true;
Expand Down
3 changes: 2 additions & 1 deletion core/os_shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,8 @@ event_t create_event(void);
void destroy_event(event_t e);
void signal_event(event_t e);
void reset_event(event_t e);
void wait_for_event(event_t e);
/* 0 means to wait forever. Returns false on timeout, true on event firing. */
bool wait_for_event(event_t e, int timeout_ms);

/* get current timer frequency in KHz */
/* NOTE: Keep in mind that with voltage scaling in power saving mode
Expand Down
25 changes: 15 additions & 10 deletions core/synch.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@

extern vm_area_vector_t *fcache_unit_areas; /* from fcache.c */

static bool started_detach = false; /* set before synchall */
bool doing_detach = false; /* set after synchall */

static void
synch_thread_yield(void);

Expand Down Expand Up @@ -666,7 +669,7 @@ check_wait_at_safe_spot(dcontext_t *dcontext, thread_synch_permission_t cur_stat
tsd->synch_perm != THREAD_SYNCH_NONE) {
STATS_INC_DC(dcontext, synch_loops_wait_safe);
#ifdef WINDOWS
if (doing_detach) {
if (started_detach) {
/* We spin for any non-detach synchs encountered during detach
* since we have no flag telling us this synch is for detach. */
/* Ref case 5074, can NOT use os_thread_yield here. This must be a user
Expand All @@ -689,7 +692,7 @@ check_wait_at_safe_spot(dcontext_t *dcontext, thread_synch_permission_t cur_stat
ENTERING_DR();
tsd->synch_perm = THREAD_SYNCH_NONE;
if (tsd->set_mcontext != NULL || tsd->set_context != NULL) {
IF_WINDOWS(ASSERT(!doing_detach));
IF_WINDOWS(ASSERT(!started_detach));
/* Make a local copy */
ASSERT(sizeof(cxt) >= sizeof(priv_mcontext_t));
if (tsd->set_mcontext != NULL) {
Expand Down Expand Up @@ -1167,7 +1170,7 @@ synch_with_all_threads(thread_synch_state_t desired_synch_state,
*/
ASSERT_CURIOSITY(desired_synch_state < THREAD_SYNCH_SUSPENDED_VALID_MCONTEXT
/* detach currently violates this: bug 8942 */
|| doing_detach);
|| started_detach);

/* must set exactly one of these -- FIXME: better way to check? */
ASSERT(TESTANY(THREAD_SYNCH_SUSPEND_FAILURE_ABORT |
Expand Down Expand Up @@ -1782,8 +1785,6 @@ translate_from_synchall_to_dispatch(thread_record_t *tr, thread_synch_state_t sy
* Detach and similar operations
*/

bool doing_detach = false;

/* Atomic variable to prevent multiple threads from trying to detach at
* the same time.
*/
Expand Down Expand Up @@ -1932,17 +1933,17 @@ detach_on_permanent_stack(bool internal, bool do_cleanup)

/* Unprotect .data for exit cleanup.
* XXX: more secure to not do this until we've synched, but then need
* alternative prot for doing_detach and init_apc_go_native*
* alternative prot for started_detach and init_apc_go_native*
*/
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);

ASSERT(!doing_detach);
doing_detach = true;
ASSERT(!started_detach);
started_detach = true;

if (!internal) {
synchronize_dynamic_options();
if (!DYNAMO_OPTION(allow_detach)) {
doing_detach = false;
started_detach = false;
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
dynamo_detaching_flag = LOCK_FREE_STATE;
SYSLOG_INTERNAL_ERROR("Detach called without the allow_detach option set");
Expand Down Expand Up @@ -1973,7 +1974,7 @@ detach_on_permanent_stack(bool internal, bool do_cleanup)
*/
init_apc_go_native_pause = true;
init_apc_go_native = true;
/* XXX i#2600: there is still a race for threads caught between init_apc_go_native
/* XXX i#2611: there is still a race for threads caught between init_apc_go_native
* and dynamo_thread_init adding to all_threads: this just reduces the risk.
* Unfortunately we can't easily use the UNIX solution of uninit_thread_count
* since we can't distinguish internally vs externally created threads.
Expand All @@ -2000,6 +2001,9 @@ detach_on_permanent_stack(bool internal, bool do_cleanup)
ASSERT(mutex_testlock(&all_threads_synch_lock) &&
mutex_testlock(&thread_initexit_lock));

ASSERT(!doing_detach);
doing_detach = true;

#ifdef HOT_PATCHING_INTERFACE
/* In hotp_only mode, we must remove patches when detaching; we don't want
* to leave in all our hooks and detach; that will definitely crash the app.
Expand Down Expand Up @@ -2178,6 +2182,7 @@ detach_on_permanent_stack(bool internal, bool do_cleanup)
dynamo_exit_post_detach();

doing_detach = false;
started_detach = false;

SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
dynamo_detaching_flag = LOCK_FREE_STATE;
Expand Down
2 changes: 1 addition & 1 deletion core/unix/ksynch.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ mutex_get_contended_event(mutex_t *lock);
/* These return 0 on success: */

ptr_int_t
ksynch_wait(KSYNCH_TYPE *var, int mustbe);
ksynch_wait(KSYNCH_TYPE *var, int mustbe, int timeout_ms);

ptr_int_t
ksynch_wake(KSYNCH_TYPE *var);
Expand Down
15 changes: 10 additions & 5 deletions core/unix/ksynch_linux.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* *******************************************************************************
* Copyright (c) 2010-2013 Google, Inc. All rights reserved.
* Copyright (c) 2010-2017 Google, Inc. All rights reserved.
* Copyright (c) 2011 Massachusetts Institute of Technology All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* *******************************************************************************/
Expand Down Expand Up @@ -45,6 +45,7 @@
#include "../globals.h"
#include "ksynch.h"
#include <linux/futex.h> /* for futex op code */
#include <sys/time.h>
#include "include/syscall.h" /* our own local copy */

#ifndef LINUX
Expand Down Expand Up @@ -107,18 +108,22 @@ ksynch_free_var(volatile int *futex)

/* Waits on the futex until woken if the kernel supports SYS_futex syscall
* and the futex's value has not been changed from mustbe. Does not block
* if the kernel doesn't support SYS_futex. Returns 0 if woken by another thread,
* if the kernel doesn't support SYS_futex. If timeout_ms is 0, there is no timeout;
* else returns a negative value on a timeout. Returns 0 if woken by another thread,
* and negative value for all other cases.
*/
ptr_int_t
ksynch_wait(volatile int *futex, int mustbe)
ksynch_wait(volatile int *futex, int mustbe, int timeout_ms)
{
ptr_int_t res;
ASSERT(ALIGNED(futex, sizeof(int)));
if (kernel_futex_support) {
/* XXX: Having debug timeout like win32 os_wait_event() would be useful */
res = dynamorio_syscall(SYS_futex, 6, futex, FUTEX_WAIT, mustbe, NULL,
NULL, 0);
struct timespec timeout;
timeout.tv_sec = (timeout_ms / 1000);
timeout.tv_nsec = ((int64)timeout_ms % 1000) * 1000000;
res = dynamorio_syscall(SYS_futex, 6, futex, FUTEX_WAIT, mustbe,
timeout_ms > 0 ? &timeout : NULL, NULL, 0);
} else {
res = -1;
}
Expand Down
13 changes: 10 additions & 3 deletions core/unix/ksynch_macos.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* *******************************************************************************
* Copyright (c) 2013-2014 Google, Inc. All rights reserved.
* Copyright (c) 2013-2017 Google, Inc. All rights reserved.
* *******************************************************************************/

/*
Expand Down Expand Up @@ -114,10 +114,17 @@ ksynch_set_value(mac_synch_t *synch, int new_val)
}

ptr_int_t
ksynch_wait(mac_synch_t *synch, int mustbe)
ksynch_wait(mac_synch_t *synch, int mustbe, int timeout_ms)
{
/* We don't need to bother with "mustbe" b/c of SYNC_POLICY_PREPOST */
kern_return_t res = semaphore_wait(synch->sem);
kern_return_t res;
if (timeout_ms > 0) {
mach_timespec_t timeout;
timeout.tv_sec = (timeout_ms / 1000);
timeout.tv_nsec = ((int64)timeout_ms % 1000) * 1000000;
res = semaphore_timedwait(synch->sem, timeout);
} else
res = semaphore_wait(synch->sem);
return (res == KERN_SUCCESS ? 0 : -1);
}

Expand Down
16 changes: 11 additions & 5 deletions core/unix/memcache.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* *******************************************************************************
* Copyright (c) 2010-2016 Google, Inc. All rights reserved.
* Copyright (c) 2010-2017 Google, Inc. All rights reserved.
* Copyright (c) 2011 Massachusetts Institute of Technology All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* *******************************************************************************/
Expand Down Expand Up @@ -326,10 +326,16 @@ void
memcache_update_locked(app_pc start, app_pc end, uint prot, int type, bool exists)
{
memcache_lock();
ASSERT(!exists ||
vmvector_overlap(all_memory_areas, start, end) ||
/* we could synch up: instead we relax the assert if DR areas not in allmem */
are_dynamo_vm_areas_stale());
/* A curiosity as it can happen when attaching to a many-threaded
* app (e.g., the api.detach_spawn test), or when dr_app_setup is
* separate from dr_app_start (i#2037).
*/
ASSERT_CURIOSITY(!exists ||
vmvector_overlap(all_memory_areas, start, end) ||
/* we could synch up: instead we relax the assert if DR areas not
* in allmem
*/
are_dynamo_vm_areas_stale());
LOG(GLOBAL, LOG_VMAREAS, 3, "\tupdating all_memory_areas "PFX"-"PFX" prot->%d\n",
start, end, prot);
memcache_update(start, end, prot, type);
Expand Down
Loading

0 comments on commit 94f565e

Please sign in to comment.