From 082d7d2544e8e080d5f260c94d093480ee72e360 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 20 Jul 2017 17:12:17 -0400 Subject: [PATCH 1/2] allow tasks to request dedicated stack space when created never copy over the root stack: this is a hybrid approach to COPY_STACK where the root task is never moved or copied, and all other task stacks are layered into the same memory area (ptls->basestack + basesize) several strategies exist for making new stacks: ucontext_t (where it is available, aka linux) unw_context_t (as an alternative to ucontext_t that avoids a syscall on task-switch) makecontext (as a posix standard implementation) setjmp/longjmp-based implementation (for systems where this is sufficient) Windows Fibers (implemented here, since we can be more efficient and reliable than the official Fibers API) also, uses an alternate stack for use in collecting stack-overflow backtraces like posix, but managed manually --- base/boot.jl | 4 +- src/Makefile | 8 +- src/gc-debug.c | 10 +- src/gc-pages.c | 1 - src/gc-stacks.c | 177 +++++++ src/gc.c | 35 +- src/gc.h | 1 + src/init.c | 53 +- src/julia.h | 16 +- src/julia_internal.h | 5 +- src/julia_threads.h | 51 +- src/options.h | 6 +- src/signals-unix.c | 14 +- src/signals-win.c | 101 ++-- src/support/END.h | 10 +- src/support/ENTRY.amd64.h | 2 + src/support/Makefile | 8 +- src/support/_longjmp.win32.S | 68 --- src/support/_longjmp.win64.S | 29 -- src/support/_setjmp.win32.S | 98 +++- src/support/_setjmp.win64.S | 128 ++++- src/support/win32_ucontext.c | 89 ++++ src/support/win32_ucontext.h | 27 + src/task.c | 971 ++++++++++++++++++++--------------- src/threading.c | 8 +- test/stack_overflow.jl | 19 + 26 files changed, 1259 insertions(+), 680 deletions(-) create mode 100644 src/gc-stacks.c delete mode 100644 src/support/_longjmp.win32.S delete mode 100644 src/support/_longjmp.win64.S create mode 100644 src/support/win32_ucontext.c create mode 100644 src/support/win32_ucontext.h create mode 100644 
test/stack_overflow.jl diff --git a/base/boot.jl b/base/boot.jl index 1cffd0107a7e2..245f655801388 100644 --- a/base/boot.jl +++ b/base/boot.jl @@ -374,7 +374,9 @@ eval(Core, :(LineInfoNode(mod::Module, method::Symbol, file::Symbol, line::Int, Module(name::Symbol=:anonymous, std_imports::Bool=true) = ccall(:jl_f_new_module, Ref{Module}, (Any, Bool), name, std_imports) -Task(@nospecialize(f)) = ccall(:jl_new_task, Ref{Task}, (Any, Int), f, 0) +function Task(@nospecialize(f), reserved_stack::Int=0) + return ccall(:jl_new_task, Ref{Task}, (Any, Int), f, reserved_stack) +end # simple convert for use by constructors of types in Core # note that there is no actual conversion defined here, diff --git a/src/Makefile b/src/Makefile index 20da34dd99d55..ea0a51d62a3a1 100644 --- a/src/Makefile +++ b/src/Makefile @@ -42,7 +42,7 @@ SRCS := \ jltypes gf typemap ast builtins module interpreter symbol \ dlload sys init task array dump staticdata toplevel jl_uv datatype \ simplevector APInt-C runtime_intrinsics runtime_ccall precompile \ - threadgroup threading stackwalk gc gc-debug gc-pages method \ + threadgroup threading stackwalk gc gc-debug gc-pages gc-stacks method \ jlapi signal-handling safepoint jloptions timing subtype rtutils \ crc32c processor @@ -207,14 +207,14 @@ $(addprefix $(BUILDDIR)/,threading.o threading.dbg.obj gc.o gc.dbg.obj init.c in $(addprefix $(BUILDDIR)/,APInt-C.o APInt-C.dbg.obj runtime_intrinsics.o runtime_intrinsics.dbg.obj): $(SRCDIR)/APInt-C.h # archive library file rules -$(BUILDDIR)/support/libsupport.a: $(SRCDIR)/support/*.h $(SRCDIR)/support/*.c +$(BUILDDIR)/support/libsupport.a: $(addprefix $(SRCDIR)/support/,*.h *.c *.S) $(SRCDIR)/support/*.c $(MAKE) -C $(SRCDIR)/support BUILDDIR='$(abspath $(BUILDDIR)/support)' -$(BUILDDIR)/support/libsupport-debug.a: $(SRCDIR)/support/*.h $(SRCDIR)/support/*.c +$(BUILDDIR)/support/libsupport-debug.a: $(addprefix $(SRCDIR)/support/,*.h *.c *.S) $(SRCDIR)/support/*.c $(MAKE) -C $(SRCDIR)/support debug 
BUILDDIR='$(abspath $(BUILDDIR)/support)' $(FLISP_EXECUTABLE_release): $(BUILDDIR)/flisp/libflisp.a -$(BUILDDIR)/flisp/libflisp.a: $(addprefix $(SRCDIR)/,flisp/*.h flisp/*.c) $(BUILDDIR)/support/libsupport.a +$(BUILDDIR)/flisp/libflisp.a: $(addprefix $(SRCDIR)/flisp/,*.h *.c) $(BUILDDIR)/support/libsupport.a $(MAKE) -C $(SRCDIR)/flisp BUILDDIR='$(abspath $(BUILDDIR)/flisp)' $(FLISP_EXECUTABLE_debug): $(BUILDDIR)/flisp/libflisp-debug.a diff --git a/src/gc-debug.c b/src/gc-debug.c index ddc369b9d5643..78709118aa910 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -595,11 +595,11 @@ static void gc_scrub_task(jl_task_t *ta) #else jl_task_t *thread_task = ptls2->root_task; #endif - if (ta == thread_task) - gc_scrub_range(ptls2->stack_lo, ptls2->stack_hi); - if (ta->stkbuf == (void*)(intptr_t)(-1) || !ta->stkbuf) - return; - gc_scrub_range((char*)ta->stkbuf, (char*)ta->stkbuf + ta->ssize); + void *stkbuf = ta->stkbuf; + if (ta == thread_task && ta->copy_stack) + gc_scrub_range((char*)ptls2->stackbase - ptls2->stacksize, (char*)ptls2->stackbase); // shared copy-stack region is [stackbase - stacksize, stackbase) + else if (stkbuf) + gc_scrub_range((char*)stkbuf, (char*)stkbuf + ta->bufsz); } void gc_scrub(void) diff --git a/src/gc-pages.c b/src/gc-pages.c index 60c89fd977c5b..b0b3feca52c6e 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -4,7 +4,6 @@ #ifndef _OS_WINDOWS_ # include #endif -#include "julia_assert.h" #ifdef __cplusplus extern "C" { diff --git a/src/gc-stacks.c b/src/gc-stacks.c new file mode 100644 index 0000000000000..8515fb7797be6 --- /dev/null +++ b/src/gc-stacks.c @@ -0,0 +1,177 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +#include "gc.h" +#ifndef _OS_WINDOWS_ +# include +#endif + +const size_t jl_guard_size = (4096 * 16); + +#ifdef _OS_WINDOWS_ +#define MAP_FAILED NULL +static void *malloc_stack(size_t bufsz) +{ + void *stk = VirtualAlloc(NULL, bufsz, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (stk == NULL) + return MAP_FAILED; + DWORD dwOldProtect; + if (!VirtualProtect(stk, jl_guard_size, PAGE_READWRITE | PAGE_GUARD, &dwOldProtect)) { + VirtualFree(stk, 0, MEM_RELEASE); + return MAP_FAILED; + } + return stk; +} + + +static void free_stack(void *stkbuf, size_t bufsz) +{ + VirtualFree(stkbuf, 0, MEM_RELEASE); +} + +#else + +static void *malloc_stack(size_t bufsz) +{ + void* stk = mmap(0, bufsz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (stk == MAP_FAILED) + return MAP_FAILED; +#if !defined(JL_HAVE_UCONTEXT) && !defined(JL_HAVE_SIGALTSTACK) + // setup a guard page to detect stack overflow + if (mprotect(stk, jl_guard_size, PROT_NONE) == -1) { + munmap(stk, bufsz); + return MAP_FAILED; + } +#endif + return stk; +} + +static void free_stack(void *stkbuf, size_t bufsz) +{ + munmap(stkbuf, bufsz); +} +#endif + + +const unsigned pool_sizes[] = { + 128 * 1024, + 192 * 1024, + 256 * 1024, + 384 * 1024, + 512 * 1024, + 768 * 1024, + 1024 * 1024, + 1536 * 1024, + 2048 * 1024, + 3 * 1024 * 1024, + 4 * 1024 * 1024, + 6 * 1024 * 1024, + 8 * 1024 * 1024, + 12 * 1024 * 1024, + 16 * 1024 * 1024, + 24 * 1024 * 1024, +}; + +static_assert(sizeof(pool_sizes) == JL_N_STACK_POOLS * sizeof(pool_sizes[0]), "JL_N_STACK_POOLS size mismatch"); + +static unsigned select_pool(size_t nb) +{ + unsigned pool_id = 0; + while (pool_sizes[pool_id] < nb) + pool_id++; + return pool_id; +} + + +static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) +{ + if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { + unsigned pool_id = select_pool(bufsz); + if (pool_sizes[pool_id] == bufsz) { + 
arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); + return; + } + } + free_stack(stkbuf, bufsz); +} + + +JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz) +{ + _jl_free_stack(jl_get_ptls_states(), stkbuf, bufsz); +} + + +JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + size_t ssize = *bufsz; + void *stk = NULL; + if (ssize <= pool_sizes[JL_N_STACK_POOLS - 1]) { + unsigned pool_id = select_pool(ssize); + ssize = pool_sizes[pool_id]; + arraylist_t *pool = &ptls->heap.free_stacks[pool_id]; + if (pool->len > 0) { + stk = arraylist_pop(pool); + } + } + else { + ssize = LLT_ALIGN(ssize, jl_page_size); + } + if (stk == NULL) { + // TODO: allocate blocks of stacks? but need to mprotect individually anyways + stk = malloc_stack(ssize); + if (stk == MAP_FAILED) + jl_throw(jl_memory_exception); + } + *bufsz = ssize; + if (owner) { + arraylist_t *live_tasks = &ptls->heap.live_tasks; + arraylist_push(live_tasks, owner); + } + return stk; +} + +void sweep_stack_pools(void) +{ +// TODO: deallocate stacks if we have too many sitting around unused +// for (stk in halfof(free_stacks)) +// free_stack(stk, pool_sz); +// // then sweep the task stacks +// for (t in live_tasks) +// if (!gc-marked(t)) +// stkbuf = t->stkbuf +// bufsz = t->bufsz +// if (stkbuf) +// push(free_stacks[sz], stkbuf) + for (int i = 0; i < jl_n_threads; i++) { + jl_ptls_t ptls2 = jl_all_tls_states[i]; + arraylist_t *live_tasks = &ptls2->heap.live_tasks; + size_t n = 0; + size_t ndel = 0; + size_t l = live_tasks->len; + void **lst = live_tasks->items; + if (l == 0) + continue; + while (1) { + jl_task_t *t = (jl_task_t*)lst[n]; + if (gc_marked(jl_astaggedvalue(t)->bits.gc)) { + n++; + } + else { + ndel++; + void *stkbuf = t->stkbuf; + size_t bufsz = t->bufsz; + if (stkbuf) { + t->stkbuf = NULL; + _jl_free_stack(ptls2, stkbuf, bufsz); + } + } + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n + ndel] 
= tmp; + } + live_tasks->len -= ndel; + } +} diff --git a/src/gc.c b/src/gc.c index 8b9c945070b90..773612b425eb7 100644 --- a/src/gc.c +++ b/src/gc.c @@ -699,7 +699,7 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, static void sweep_weak_refs(void) { - for (int i = 0;i < jl_n_threads;i++) { + for (int i = 0; i < jl_n_threads; i++) { jl_ptls_t ptls2 = jl_all_tls_states[i]; size_t n = 0; size_t ndel = 0; @@ -710,7 +710,8 @@ static void sweep_weak_refs(void) while (1) { jl_weakref_t *wr = (jl_weakref_t*)lst[n]; if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) { - // weakref itself is alive + // weakref itself is alive, + // so the user could still re-set it to a new value if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) wr->value = (jl_value_t*)jl_nothing; n++; @@ -722,7 +723,7 @@ static void sweep_weak_refs(void) break; void *tmp = lst[n]; lst[n] = lst[n + ndel]; - lst[n+ndel] = tmp; + lst[n + ndel] = tmp; } ptls2->heap.weak_refs.len -= ndel; } @@ -1026,7 +1027,7 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t int freedall = 1; int pg_skpd = 1; if (!pg->has_marked) { - // lazy version: (empty) if the whole page was already unused, free it + // lazy version: (empty) if the whole page was already unused, free it (return it to the pool) // eager version: (freedall) free page as soon as possible // the eager one uses less memory. 
// FIXME - need to do accounting on a per-thread basis @@ -2124,19 +2125,13 @@ mark: { objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_task_t)); jl_task_t *ta = (jl_task_t*)new_obj; gc_scrub_record_task(ta); - int stkbuf = (ta->stkbuf != (void*)(intptr_t)-1 && ta->stkbuf != NULL); + void *stkbuf = ta->stkbuf; int16_t tid = ta->tid; jl_ptls_t ptls2 = jl_all_tls_states[tid]; - if (stkbuf) { #ifdef COPY_STACKS - gc_setmark_buf_(ptls, ta->stkbuf, bits, ta->bufsz); -#else - // stkbuf isn't owned by julia for the root task - if (ta != ptls2->root_task) { - gc_setmark_buf_(ptls, ta->stkbuf, bits, ta->ssize); - } + if (stkbuf && ta->copy_stack) + gc_setmark_buf_(ptls, stkbuf, bits, ta->bufsz); #endif - } jl_gcframe_t *s = NULL; size_t nroots; uintptr_t offset = 0; @@ -2148,9 +2143,11 @@ mark: { else if (stkbuf) { s = ta->gcstack; #ifdef COPY_STACKS - ub = (uintptr_t)ptls2->stackbase; - lb = ub - ta->ssize; - offset = (uintptr_t)ta->stkbuf - lb; + if (ta->copy_stack) { + ub = (uintptr_t)ptls2->stackbase; + lb = ub - ta->copy_stack; + offset = (uintptr_t)stkbuf - lb; + } #endif } if (s) { @@ -2278,10 +2275,6 @@ static void mark_roots(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp) if (jl_all_methods != NULL) gc_mark_queue_obj(gc_cache, sp, jl_all_methods); -#ifndef COPY_STACKS - gc_mark_queue_obj(gc_cache, sp, jl_unprotect_stack_func); -#endif - // constants gc_mark_queue_obj(gc_cache, sp, jl_typetype_type); gc_mark_queue_obj(gc_cache, sp, jl_emptytuple_type); @@ -2564,6 +2557,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, int full) scanned_bytes = 0; // 5. 
start sweeping sweep_weak_refs(); + sweep_stack_pools(); gc_sweep_other(ptls, sweep_full); gc_scrub(); gc_verify_tags(); @@ -2687,6 +2681,7 @@ void jl_init_thread_heap(jl_ptls_t ptls) p[i].newpages = NULL; } arraylist_new(&heap->weak_refs, 0); + arraylist_new(&heap->live_tasks, 0); heap->mallocarrays = NULL; heap->mafreelist = NULL; heap->big_objects = NULL; diff --git a/src/gc.h b/src/gc.h index c6a284d7d32ab..51b8fca58c466 100644 --- a/src/gc.h +++ b/src/gc.h @@ -495,6 +495,7 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, gc_mark_sp_t *sp); void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, gc_mark_sp_t *sp, arraylist_t *list, size_t start); void gc_mark_loop(jl_ptls_t ptls, gc_mark_sp_t sp); +void sweep_stack_pools(void); void gc_debug_init(void); extern void *gc_mark_label_addrs[_GC_MARK_L_MAX]; diff --git a/src/init.c b/src/init.c index 5bfd0fbf3d48b..b2c5b3de76ec2 100644 --- a/src/init.c +++ b/src/init.c @@ -57,28 +57,18 @@ JL_DLLEXPORT const char* __asan_default_options() { size_t jl_page_size; -void jl_init_stack_limits(int ismaster) +void jl_init_stack_limits(int ismaster, void **stack_lo, void **stack_hi) { - jl_ptls_t ptls = jl_get_ptls_states(); #ifdef _OS_WINDOWS_ (void)ismaster; -# ifdef _COMPILER_MICROSOFT_ -# ifdef _P64 - void **tib = (void**)__readgsqword(0x30); -# else - void **tib = (void**)__readfsdword(0x18); -# endif -# else - void **tib; -# ifdef _P64 - __asm__("movq %%gs:0x30, %0" : "=r" (tib) : : ); -# else - __asm__("movl %%fs:0x18, %0" : "=r" (tib) : : ); -# endif -# endif // https://en.wikipedia.org/wiki/Win32_Thread_Information_Block - ptls->stack_hi = (char*)tib[1]; // Stack Base / Bottom of stack (high address) - ptls->stack_lo = (char*)tib[2]; // Stack Limit / Ceiling of stack (low address) +#ifdef _P64 + *stack_hi = (void**)__readgsqword(0x08); // Stack Base / Bottom of stack (high address) + *stack_lo = (void**)__readgsqword(0x10); // Stack Limit / Ceiling of stack (low address) +#else + *stack_hi = 
(void**)__readfsdword(0x04); // Stack Base / Bottom of stack (high address) + *stack_lo = (void**)__readfsdword(0x08); // Stack Limit / Ceiling of stack (low address) +#endif #else # ifdef JULIA_ENABLE_THREADING // Only use pthread_*_np functions to get stack address for non-master @@ -92,8 +82,8 @@ void jl_init_stack_limits(int ismaster) size_t stacksize; pthread_attr_getstack(&attr, &stackaddr, &stacksize); pthread_attr_destroy(&attr); - ptls->stack_lo = (char*)stackaddr; - ptls->stack_hi = (char*)stackaddr + stacksize; + *stack_lo = (void*)stackaddr; + *stack_hi = (void*)((char*)stackaddr + stacksize); return; # elif defined(_OS_DARWIN_) extern void *pthread_get_stackaddr_np(pthread_t thread); @@ -101,8 +91,8 @@ void jl_init_stack_limits(int ismaster) pthread_t thread = pthread_self(); void *stackaddr = pthread_get_stackaddr_np(thread); size_t stacksize = pthread_get_stacksize_np(thread); - ptls->stack_lo = (char*)stackaddr; - ptls->stack_hi = (char*)stackaddr + stacksize; + *stack_lo = (char*)stackaddr; + *stack_hi = (void*)((char*)stackaddr + stacksize); return; # elif defined(_OS_FREEBSD_) pthread_attr_t attr; @@ -112,11 +102,11 @@ void jl_init_stack_limits(int ismaster) size_t stacksize; pthread_attr_getstack(&attr, &stackaddr, &stacksize); pthread_attr_destroy(&attr); - ptls->stack_lo = (char*)stackaddr; - ptls->stack_hi = (char*)stackaddr + stacksize; + *stack_lo = (char*)stackaddr; + *stack_hi = (void*)((char*)stackaddr + stacksize); return; # else -# warning "Getting stack size for thread is not supported." +# warning "Getting precise stack size for thread is not supported." 
# endif } # else @@ -125,12 +115,12 @@ void jl_init_stack_limits(int ismaster) struct rlimit rl; getrlimit(RLIMIT_STACK, &rl); size_t stack_size = rl.rlim_cur; - ptls->stack_hi = (char*)&stack_size; - ptls->stack_lo = ptls->stack_hi - stack_size; + *stack_hi = (void*)&stack_size; + *stack_lo = (void*)((char*)*stack_hi - stack_size); #endif } -static void jl_find_stack_bottom(void) +static void jl_prep_sanitizers(void) { #if !defined(_OS_WINDOWS_) #if defined(JL_ASAN_ENABLED) || defined(JL_MSAN_ENABLED) @@ -153,7 +143,6 @@ static void jl_find_stack_bottom(void) } #endif #endif - jl_init_stack_limits(1); } struct uv_shutdown_queue_item { uv_handle_t *h; struct uv_shutdown_queue_item *next; }; @@ -642,7 +631,9 @@ void _julia_init(JL_IMAGE_SEARCH rel) total_mem = (size_t)-1; } jl_arr_xtralloc_limit = total_mem / 100; // Extra allocation limited to 1% of total RAM - jl_find_stack_bottom(); + jl_prep_sanitizers(); + void *stack_lo, *stack_hi; + jl_init_stack_limits(1, &stack_lo, &stack_hi); jl_dl_handle = jl_load_dynamic_library(NULL, JL_RTLD_DEFAULT, 1); #ifdef _OS_WINDOWS_ jl_ntdll_handle = jl_dlopen("ntdll.dll", 0); // bypass julia's pathchecking for system dlls @@ -711,7 +702,7 @@ void _julia_init(JL_IMAGE_SEARCH rel) jl_init_types(); jl_init_frontend(); jl_init_tasks(); - jl_init_root_task(ptls->stack_lo, ptls->stack_hi-ptls->stack_lo); + jl_init_root_task(stack_lo, stack_hi); #ifdef ENABLE_TIMINGS jl_root_task->timing_stack = jl_root_timing; diff --git a/src/julia.h b/src/julia.h index 5b35d52dc41d1..5c08ee051dc2c 100644 --- a/src/julia.h +++ b/src/julia.h @@ -29,6 +29,7 @@ # define MAX_ALIGN sizeof(void*) # endif #else +# include "win32_ucontext.h" # define jl_jmp_buf jmp_buf # include //for _resetstkoflw # define MAX_ALIGN 8 @@ -717,6 +718,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_1w(void); JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void); JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void); JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz); +JL_DLLEXPORT void 
*jl_malloc_stack(size_t *bufsz, struct _jl_task_t *owner); +JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz); JL_DLLEXPORT void jl_gc_use(jl_value_t *a); JL_DLLEXPORT void jl_clear_malloc_data(void); @@ -1577,6 +1580,7 @@ JL_DLLEXPORT void jl_sigatomic_end(void); // tasks and exceptions ------------------------------------------------------- + typedef struct _jl_timing_block_t jl_timing_block_t; // info describing an exception handler typedef struct _jl_handler_t { @@ -1604,13 +1608,13 @@ typedef struct _jl_task_t { jl_value_t *backtrace; jl_value_t *logstate; jl_function_t *start; - jl_jmp_buf ctx; - size_t bufsz; - void *stkbuf; -// hidden fields: - size_t ssize; - size_t started:1; +// hidden state: + jl_ucontext_t ctx; // saved thread state + void *stkbuf; // malloc'd memory (either copybuf or stack) + size_t bufsz; // actual sizeof stkbuf + unsigned int copy_stack:31; // sizeof stack for copybuf + unsigned int started:1; // current exception handler jl_handler_t *eh; diff --git a/src/julia_internal.h b/src/julia_internal.h index 1426821edad03..564254a774f25 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -387,7 +387,6 @@ JL_DLLEXPORT void jl_typeassert(jl_value_t *x, jl_value_t *t); JL_CALLABLE(jl_f_tuple); JL_CALLABLE(jl_f_intrinsic_call); -extern jl_function_t *jl_unprotect_stack_func; void jl_install_default_signal_handlers(void); void restore_signals(void); void jl_install_thread_signal_handler(jl_ptls_t ptls); @@ -489,8 +488,8 @@ void jl_init_codegen(void); void jl_init_intrinsic_functions(void); void jl_init_intrinsic_properties(void); void jl_init_tasks(void) JL_GC_DISABLED; -void jl_init_stack_limits(int ismaster); -void jl_init_root_task(void *stack, size_t ssize); +void jl_init_stack_limits(int ismaster, void **stack_lo, void **stack_hi); +void jl_init_root_task(void *stack_lo, void *stack_hi); void jl_init_serializer(void); void jl_gc_init(void); void jl_init_signal_async(void); diff --git a/src/julia_threads.h 
b/src/julia_threads.h index 204e358a39763..b2f7c27fae84b 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -13,6 +13,44 @@ // JULIA_ENABLE_THREADING is switched on in Make.inc if JULIA_THREADS is // set (in Make.user) +// Options for task switching algorithm (in order of preference): +// JL_HAVE_ASM -- mostly setjmp +// JL_HAVE_UNW_CONTEXT -- hybrid of libunwind for start, setjmp for resume +// JL_HAVE_UCONTEXT -- posix standard API, requires syscall for resume +// JL_HAVE_SIGALTSTACK -- requires several syscalls for start, setjmp for resume + +#ifdef _OS_WINDOWS_ +#define JL_HAVE_UCONTEXT +typedef win32_ucontext_t jl_ucontext_t; +#else +#if !defined(JL_HAVE_UCONTEXT) && \ + !defined(JL_HAVE_ASM) && \ + !defined(JL_HAVE_UNW_CONTEXT) && \ + !defined(JL_HAVE_SIGALTSTACK) +#if (defined(_CPU_X86_64_) || defined(_CPU_X86_) || defined(_CPU_AARCH64_) || defined(_CPU_ARM_)) +#define JL_HAVE_ASM +#elif defined(_OS_DARWIN_) +#define JL_HAVE_UNW_CONTEXT +#elif defined(_OS_LINUX_) +#define JL_HAVE_UCONTEXT +#else +#define JL_HAVE_UNW_CONTEXT +#endif +#endif + +#if defined(JL_HAVE_ASM) || defined(JL_HAVE_SIGALTSTACK) +typedef struct { + jl_jmp_buf uc_mcontext; +} jl_ucontext_t; +#endif +#if defined(JL_HAVE_UCONTEXT) || defined(JL_HAVE_UNW_CONTEXT) +#define UNW_LOCAL_ONLY +#include <libunwind.h> +typedef ucontext_t jl_ucontext_t; +#endif +#endif + + // Recursive spin lock typedef struct { volatile unsigned long owner; @@ -28,6 +66,9 @@ typedef struct { typedef struct { // variable for tracking weak references arraylist_t weak_refs; + // live tasks started on this thread + // that are holding onto a stack from the pool + arraylist_t live_tasks; // variables for tracking malloc'd arrays struct _mallocarray_t *mallocarrays; @@ -53,6 +94,9 @@ typedef struct { # define JL_GC_N_POOLS 43 #endif jl_gc_pool_t norm_pools[JL_GC_N_POOLS]; + +#define JL_N_STACK_POOLS 16 + arraylist_t free_stacks[JL_N_STACK_POOLS]; } jl_thread_heap_t; // Cache of thread local change to global metadata during GC 
@@ -101,10 +145,11 @@ struct _jl_tls_states_t { struct _jl_module_t *current_module; struct _jl_task_t *volatile current_task; struct _jl_task_t *root_task; +//#ifdef COPY_STACKS void *stackbase; - char *stack_lo; - char *stack_hi; - jl_jmp_buf base_ctx; // base context of stack + size_t stacksize; + jl_ucontext_t base_ctx; // base context of stack +//#endif jl_jmp_buf *safe_restore; int16_t tid; size_t bt_size; diff --git a/src/options.h b/src/options.h index b860edecce3dc..0c1ed879cbf4b 100644 --- a/src/options.h +++ b/src/options.h @@ -100,12 +100,10 @@ // task options --------------------------------------------------------------- -// select an implementation of stack switching. -// currently only COPY_STACKS is recommended. -#ifndef COPY_STACKS +// select whether to enable the COPY_STACKS stack switching optimization #define COPY_STACKS -#endif +#define JL_STACK_SIZE (8*1024*1024) // threading options ---------------------------------------------------------- diff --git a/src/signals-unix.c b/src/signals-unix.c index 298270b52aca0..85f947edfb2a0 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -182,13 +182,13 @@ static pthread_t signals_thread; static int is_addr_on_stack(jl_ptls_t ptls, void *addr) { -#ifdef COPY_STACKS - return ((char*)addr > (char*)ptls->stack_lo-3000000 && - (char*)addr < (char*)ptls->stack_hi); -#else - return ((char*)addr > (char*)ptls->current_task->stkbuf && - (char*)addr < (char*)ptls->current_task->stkbuf + ptls->current_task->ssize); -#endif + jl_task_t *t = ptls->current_task; + if (t->copy_stack) + return ((char*)addr > (char*)ptls->stackbase - ptls->stacksize && + (char*)addr < (char*)ptls->stackbase); + else + return ((char*)addr > (char*)t->stkbuf && + (char*)addr < (char*)t->stkbuf + t->bufsz); } static void sigdie_handler(int sig, siginfo_t *info, void *context) diff --git a/src/signals-win.c b/src/signals-win.c index 264c197e41c40..2af450725c951 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -3,7 +3,6 
@@ // Windows #define sig_stack_size 131072 // 128k reserved for SEGV handling -static BOOL (*pSetThreadStackGuarantee)(PULONG); // Copied from MINGW_FLOAT_H which may not be found due to a collision with the builtin gcc float.h // eventually we can probably integrate this into OpenLibm. @@ -97,29 +96,47 @@ void __cdecl crt_sig_handler(int sig, int num) } } +static jl_ucontext_t collect_backtrace_fiber; +static jl_ucontext_t error_return_fiber; +static PCONTEXT error_ctx; +static int have_backtrace_fiber; +static void JL_NORETURN start_backtrace_fiber(void) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + // collect the backtrace + ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, error_ctx); + // switch back to the execution fiber + jl_setcontext(&error_return_fiber); + abort(); +} + void restore_signals(void) { // turn on ctrl-c handler SetConsoleCtrlHandler(NULL, 0); - // see if SetThreadStackGuarantee exists - jl_dlsym(jl_kernel32_handle, "SetThreadStackGuarantee", (const void **)&pSetThreadStackGuarantee, 0); } -void jl_throw_in_ctx(jl_value_t *excpt, CONTEXT *ctxThread, int bt) +void jl_throw_in_ctx(jl_value_t *excpt, PCONTEXT ctxThread) { jl_ptls_t ptls = jl_get_ptls_states(); #if defined(_CPU_X86_64_) - DWORD64 Rsp = (ctxThread->Rsp&(DWORD64)-16) - 8; + DWORD64 Rsp = (ctxThread->Rsp & (DWORD64)-16) - 8; #elif defined(_CPU_X86_) - DWORD32 Esp = (ctxThread->Esp&(DWORD32)-16) - 4; + DWORD32 Esp = (ctxThread->Esp & (DWORD32)-16) - 4; #else #error WIN16 not supported :P #endif if (!ptls->safe_restore) { assert(excpt != NULL); - ptls->bt_size = bt ? 
rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, - ctxThread) : 0; - ptls->exception_in_transit = excpt; + ptls->bt_size = 0; + if (excpt != jl_stackovf_exception) { + ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, ctxThread); + } + else if (have_backtrace_fiber) { + error_ctx = ctxThread; + jl_swapcontext(&error_return_fiber, &collect_backtrace_fiber); + } + jl_exception_in_transit = excpt; } #if defined(_CPU_X86_64_) *(DWORD64*)Rsp = 0; @@ -160,7 +177,7 @@ static void jl_try_deliver_sigint(void) jl_safe_printf("error: GetThreadContext failed\n"); return; } - jl_throw_in_ctx(jl_interrupt_exception, &ctxThread, 1); + jl_throw_in_ctx(jl_interrupt_exception, &ctxThread); ctxThread.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; if (!SetThreadContext(hMainThread, &ctxThread)) { jl_safe_printf("error: SetThreadContext failed\n"); @@ -193,19 +210,17 @@ static BOOL WINAPI sigint_handler(DWORD wsig) //This needs winapi types to guara return 1; } -static LONG WINAPI _exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo, int in_ctx) +LONG WINAPI jl_exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo) { jl_ptls_t ptls = jl_get_ptls_states(); if (ExceptionInfo->ExceptionRecord->ExceptionFlags == 0) { switch (ExceptionInfo->ExceptionRecord->ExceptionCode) { case EXCEPTION_INT_DIVIDE_BY_ZERO: fpreset(); - jl_throw_in_ctx(jl_diverror_exception, - ExceptionInfo->ContextRecord,in_ctx); + jl_throw_in_ctx(jl_diverror_exception, ExceptionInfo->ContextRecord); return EXCEPTION_CONTINUE_EXECUTION; case EXCEPTION_STACK_OVERFLOW: - jl_throw_in_ctx(jl_stackovf_exception, - ExceptionInfo->ContextRecord,in_ctx&&pSetThreadStackGuarantee); + jl_throw_in_ctx(jl_stackovf_exception, ExceptionInfo->ContextRecord); return EXCEPTION_CONTINUE_EXECUTION; case EXCEPTION_ACCESS_VIOLATION: if (jl_addr_is_safepoint(ExceptionInfo->ExceptionRecord->ExceptionInformation[1])) { @@ -220,18 +235,16 @@ static LONG WINAPI _exception_handler(struct _EXCEPTION_POINTERS 
*ExceptionInfo, } else if (jl_safepoint_consume_sigint()) { jl_clear_force_sigint(); - jl_throw_in_ctx(jl_interrupt_exception, - ExceptionInfo->ContextRecord, in_ctx); + jl_throw_in_ctx(jl_interrupt_exception, ExceptionInfo->ContextRecord); } return EXCEPTION_CONTINUE_EXECUTION; } if (ptls->safe_restore) { - jl_throw_in_ctx(NULL, ExceptionInfo->ContextRecord, in_ctx); + jl_throw_in_ctx(NULL, ExceptionInfo->ContextRecord); return EXCEPTION_CONTINUE_EXECUTION; } if (ExceptionInfo->ExceptionRecord->ExceptionInformation[0] == 1) { // writing to read-only memory (e.g. mmap) - jl_throw_in_ctx(jl_readonlymemory_exception, - ExceptionInfo->ContextRecord,in_ctx); + jl_throw_in_ctx(jl_readonlymemory_exception, ExceptionInfo->ContextRecord); return EXCEPTION_CONTINUE_EXECUTION; } } @@ -298,38 +311,6 @@ static LONG WINAPI _exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo, return EXCEPTION_CONTINUE_SEARCH; } -static LONG WINAPI exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo) -{ - return _exception_handler(ExceptionInfo,1); -} - -#if defined(_CPU_X86_64_) -JL_DLLEXPORT EXCEPTION_DISPOSITION __julia_personality( - PEXCEPTION_RECORD ExceptionRecord, - void *EstablisherFrame, - PCONTEXT ContextRecord, - void *DispatcherContext) -{ - EXCEPTION_POINTERS ExceptionInfo; - ExceptionInfo.ExceptionRecord = ExceptionRecord; - ExceptionInfo.ContextRecord = ContextRecord; - - EXCEPTION_DISPOSITION rval; - switch (_exception_handler(&ExceptionInfo,1)) { - case EXCEPTION_CONTINUE_EXECUTION: - rval = ExceptionContinueExecution; break; - case EXCEPTION_CONTINUE_SEARCH: - rval = ExceptionContinueSearch; break; -#ifndef _MSC_VER - case EXCEPTION_EXECUTE_HANDLER: - rval = ExceptionExecuteHandler; break; -#endif - } - - return rval; -} -#endif - JL_DLLEXPORT void jl_install_sigint_handler(void) { SetConsoleCtrlHandler((PHANDLER_ROUTINE)sigint_handler,1); @@ -428,17 +409,15 @@ void jl_install_default_signal_handlers(void) if (signal(SIGABRT, (void (__cdecl 
*)(int))crt_sig_handler) == SIG_ERR) { jl_error("fatal error: Couldn't set SIGABRT"); } - SetUnhandledExceptionFilter(exception_handler); + SetUnhandledExceptionFilter(jl_exception_handler); } void jl_install_thread_signal_handler(jl_ptls_t ptls) { - (void)ptls; - // Ensure the stack overflow handler has enough space to collect the backtrace - ULONG StackSizeInBytes = sig_stack_size; - if (pSetThreadStackGuarantee) { - if (!pSetThreadStackGuarantee(&StackSizeInBytes)) { - pSetThreadStackGuarantee = NULL; - } - } + size_t ssize = sig_stack_size; + void *stk = jl_malloc_stack(&ssize, NULL); + collect_backtrace_fiber.uc_stack.ss_sp = (void*)stk; + collect_backtrace_fiber.uc_stack.ss_size = ssize; + jl_makecontext(&collect_backtrace_fiber, start_backtrace_fiber); + have_backtrace_fiber = 1; } diff --git a/src/support/END.h b/src/support/END.h index 3f551aca5716d..090bbc02eeb1c 100644 --- a/src/support/END.h +++ b/src/support/END.h @@ -36,13 +36,15 @@ #if defined(__linux__) || defined(__FreeBSD__) || defined(__ELF__) .size CNAME, . 
- CNAME #else -#ifndef _MSC_VER -.end -#else +#ifdef _MSC_VER CNAME endp -end +#else +#ifdef _WIN64 +.seh_endproc #endif #endif +#endif + #undef CNAME #undef HIDENAME diff --git a/src/support/ENTRY.amd64.h b/src/support/ENTRY.amd64.h index 4ad13fc421da4..b8049f0711f89 100644 --- a/src/support/ENTRY.amd64.h +++ b/src/support/ENTRY.amd64.h @@ -66,7 +66,9 @@ _START_ENTRY .scl 2 .type 32 .endef +.seh_proc EXT(CNAME) EXT(CNAME): +.seh_endprologue #else .code CNAME proc diff --git a/src/support/Makefile b/src/support/Makefile index ffd1bb7c135f3..66964b191585b 100644 --- a/src/support/Makefile +++ b/src/support/Makefile @@ -9,13 +9,11 @@ override CPPFLAGS += $(JCPPFLAGS) SRCS := hashing timefuncs ptrhash operators utf8 ios htable bitvector \ int2str libsupportinit arraylist strtod ifeq ($(OS),WINNT) -SRCS += asprintf strptime +SRCS += asprintf strptime win32_ucontext ifeq ($(ARCH),i686) -SRCS += _setjmp.win32 _longjmp.win32 -else ifeq ($(ARCH),i386) -SRCS += _setjmp.win32 _longjmp.win32 +SRCS += _setjmp.win32 else ifeq ($(ARCH),x86_64) -SRCS += _setjmp.win64 _longjmp.win64 +SRCS += _setjmp.win64 endif endif ifeq ($(USEMSVC), 1) diff --git a/src/support/_longjmp.win32.S b/src/support/_longjmp.win32.S deleted file mode 100644 index 422a743422a7b..0000000000000 --- a/src/support/_longjmp.win32.S +++ /dev/null @@ -1,68 +0,0 @@ -/* $NetBSD: _setjmp.S,v 1.8 2005/10/05 20:18:12 christos Exp $ */ - -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * William Jolitz. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: @(#)_setjmp.s 5.1 (Berkeley) 4/23/90 - */ - -/*#include -#if defined(LIBC_SCCS) - RCSID("$NetBSD: _setjmp.S,v 1.8 2005/10/05 20:18:12 christos Exp $") -#endif -*/ - -/* - * C library -- _setjmp, _longjmp - * - * _longjmp(a,v) - * will generate a "return(v)" from the last call to - * _setjmp(a) - * by restoring registers from the stack. - * The previous signal state is NOT restored. 
- */ - -#define CNAME jl_longjmp -#include "ENTRY.i387.h" - mov edx,DWORD PTR [esp+4] - mov eax,DWORD PTR [esp+8] - mov ebp,DWORD PTR [edx+0] - mov ebx,DWORD PTR [edx+4] - mov edi,DWORD PTR [edx+8] - mov esi,DWORD PTR [edx+12] - mov esp,DWORD PTR [edx+16] - mov ecx,DWORD PTR [edx+20] - test eax,eax - jne a - inc eax -a: mov DWORD PTR [esp],ecx - ret -#include "END.h" diff --git a/src/support/_longjmp.win64.S b/src/support/_longjmp.win64.S deleted file mode 100644 index 5bb17ea190586..0000000000000 --- a/src/support/_longjmp.win64.S +++ /dev/null @@ -1,29 +0,0 @@ -#define CNAME jl_longjmp -#include "ENTRY.amd64.h" - mov rbx,QWORD PTR [rcx+8] - mov rsp,QWORD PTR [rcx+16] - mov rbp,QWORD PTR [rcx+24] - mov rsi,QWORD PTR [rcx+32] - mov rdi,QWORD PTR [rcx+40] - mov r12,QWORD PTR [rcx+48] - mov r13,QWORD PTR [rcx+56] - mov r14,QWORD PTR [rcx+64] - mov r15,QWORD PTR [rcx+72] - mov r8, QWORD PTR [rcx+80] - movaps xmm6,XMMWORD PTR [rcx+96] - movaps xmm7,XMMWORD PTR [rcx+112] - movaps xmm8,XMMWORD PTR [rcx+128] - movaps xmm9,XMMWORD PTR [rcx+144] - movaps xmm10,XMMWORD PTR [rcx+160] - movaps xmm11,XMMWORD PTR [rcx+176] - movaps xmm12,XMMWORD PTR [rcx+192] - movaps xmm13,XMMWORD PTR [rcx+208] - movaps xmm14,XMMWORD PTR [rcx+224] - movaps xmm15,XMMWORD PTR [rcx+240] - mov eax,edx - test eax,eax - jne a - inc eax -a: mov QWORD PTR [rsp],r8 - ret -#include "END.h" diff --git a/src/support/_setjmp.win32.S b/src/support/_setjmp.win32.S index d3e6bba074733..441872dd4261a 100644 --- a/src/support/_setjmp.win32.S +++ b/src/support/_setjmp.win32.S @@ -48,18 +48,112 @@ * _setjmp(a) * by restoring registers from the stack. * The previous signal state is NOT restored. 
+ * _swapcontext(a, b) + * Store the current context in a and resume in context b + * + * TODO: save/restore floating point control state + * and reset avx state + * and update fs:[0xE0C] to contain the address of the stack */ #define CNAME jl_setjmp +#include "ENTRY.i387.h" + mov eax,DWORD PTR [esp+4] // arg 1 + mov edx,DWORD PTR [esp+0] // rta + mov DWORD PTR [eax+0],ebp + mov DWORD PTR [eax+4],ebx + mov DWORD PTR [eax+8],edi + mov DWORD PTR [eax+12],esi + mov DWORD PTR [eax+16],esp + mov DWORD PTR [eax+20],edx // eip + mov edx,DWORD PTR fs:[0] // seh registration + mov DWORD PTR [eax+24],edx + xor eax,eax # return 0 + ret +#include "END.h" + + +#define CNAME jl_longjmp +#include "ENTRY.i387.h" + mov edx,DWORD PTR [esp+4] // arg 1 + mov eax,DWORD PTR [esp+8] // arg 2 + mov ebp,DWORD PTR [edx+24] // seh registration + mov ecx,DWORD PTR [edx+20] // eip + mov esp,DWORD PTR [edx+16] + mov esi,DWORD PTR [edx+12] + mov edi,DWORD PTR [edx+8] + mov ebx,DWORD PTR [edx+4] + mov DWORD PTR fs:[0],ebp + mov ebp,DWORD PTR [edx+0] + mov DWORD PTR [esp],ecx + test eax,eax + jne a + inc eax +a: ret // jmp ecx +#include "END.h" + + +#define CNAME jl_swapcontext #include "ENTRY.i387.h" mov eax,DWORD PTR [esp+4] + // save stack registers + mov edx,DWORD PTR fs:[8] // stack top (low) + mov ecx,DWORD PTR fs:[4] // stack bottom (high) + mov DWORD PTR [eax+0],edx // sp + sub ecx,edx + mov DWORD PTR [eax+4],ecx // ssize + add eax,8 + // save uc_mcontext mov edx,DWORD PTR [esp+0] - mov DWORD PTR [eax+0],ebp /* rta */ + mov ecx,DWORD PTR fs:[0] + mov DWORD PTR [eax+0],ebp mov DWORD PTR [eax+4],ebx mov DWORD PTR [eax+8],edi mov DWORD PTR [eax+12],esi mov DWORD PTR [eax+16],esp mov DWORD PTR [eax+20],edx - xor eax,eax + mov DWORD PTR [eax+24],ecx + add esp,4 + jmp _jl_setcontext +#include "END.h" + + +#define CNAME jl_setcontext +#include "ENTRY.i387.h" + mov eax,DWORD PTR [esp+4] + // restore stack registers + mov edx,DWORD PTR [eax+0] + mov ecx,DWORD PTR [eax+4] + mov DWORD PTR fs:[8],edx 
// stack top (low) + add ecx,edx + mov DWORD PTR fs:[4],ecx // stack bottom (high) + add eax,8 + // restore uc_mcontext + mov ebp,DWORD PTR [eax+24] + mov ecx,DWORD PTR [eax+20] + mov esp,DWORD PTR [eax+16] + mov esi,DWORD PTR [eax+12] + mov edi,DWORD PTR [eax+8] + mov ebx,DWORD PTR [eax+4] + mov DWORD PTR fs:[0],ebp + mov ebp,DWORD PTR [eax+0] + mov DWORD PTR [esp],ecx + xor eax,eax # return 0 + inc eax # HACK: return 1 + ret +#include "END.h" + + +#define CNAME __readgs +#include "ENTRY.i387.h" + mov eax,gs + ret +#include "END.h" + + +#define CNAME __readgsdword +#include "ENTRY.i387.h" + mov eax,DWORD PTR [esp+4] + mov eax,gs:[eax] ret #include "END.h" diff --git a/src/support/_setjmp.win64.S b/src/support/_setjmp.win64.S index d490bae0b6d1f..cb512cfe4ab3e 100644 --- a/src/support/_setjmp.win64.S +++ b/src/support/_setjmp.win64.S @@ -1,7 +1,16 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +/* + * TODO: save/restore floating point control state + * and reset avx state + * and update gs:[0x1478] to contain the address of the stack + */ + #define CNAME jl_setjmp #include "ENTRY.amd64.h" - mov rdx,QWORD PTR [rsp] - mov QWORD PTR [rcx],0 + mov rdx,QWORD PTR [rsp] // rta + mov rax,QWORD PTR gs:[0] // SEH + mov QWORD PTR [rcx+0],rax mov QWORD PTR [rcx+8],rbx mov QWORD PTR [rcx+16],rsp mov QWORD PTR [rcx+24],rbp @@ -11,7 +20,7 @@ mov QWORD PTR [rcx+56],r13 mov QWORD PTR [rcx+64],r14 mov QWORD PTR [rcx+72],r15 - mov QWORD PTR [rcx+80],rdx + mov QWORD PTR [rcx+80],rdx // rip mov QWORD PTR [rcx+88],0 movaps XMMWORD PTR [rcx+96],xmm6 movaps XMMWORD PTR [rcx+112],xmm7 @@ -23,6 +32,117 @@ movaps XMMWORD PTR [rcx+208],xmm13 movaps XMMWORD PTR [rcx+224],xmm14 movaps XMMWORD PTR [rcx+240],xmm15 - xor rax,rax + xor rax,rax # return 0 + ret +#include "END.h" + + +#define CNAME jl_longjmp +#include "ENTRY.amd64.h" + mov rax,QWORD PTR [rcx+0] + mov rbx,QWORD PTR [rcx+8] + mov rsp,QWORD PTR [rcx+16] + mov rbp,QWORD PTR [rcx+24] + mov rsi,QWORD 
PTR [rcx+32] + mov rdi,QWORD PTR [rcx+40] + mov r12,QWORD PTR [rcx+48] + mov r13,QWORD PTR [rcx+56] + mov r14,QWORD PTR [rcx+64] + mov r15,QWORD PTR [rcx+72] + mov r8, QWORD PTR [rcx+80] + movaps xmm6,XMMWORD PTR [rcx+96] + movaps xmm7,XMMWORD PTR [rcx+112] + movaps xmm8,XMMWORD PTR [rcx+128] + movaps xmm9,XMMWORD PTR [rcx+144] + movaps xmm10,XMMWORD PTR [rcx+160] + movaps xmm11,XMMWORD PTR [rcx+176] + movaps xmm12,XMMWORD PTR [rcx+192] + movaps xmm13,XMMWORD PTR [rcx+208] + movaps xmm14,XMMWORD PTR [rcx+224] + movaps xmm15,XMMWORD PTR [rcx+240] + mov QWORD PTR gs:[0],rax + mov eax,edx // move arg2 to return + test eax,eax + jne a + inc eax +a: mov QWORD PTR [rsp],r8 + ret +#include "END.h" + + +#define CNAME jl_swapcontext +#include "ENTRY.amd64.h" + // save stack registers + mov r8,QWORD PTR gs:[16] // stack top (low) + mov rax,QWORD PTR gs:[8] // stack bottom (high) + mov QWORD PTR [rcx+0],r8 // sp + sub rax,r8 + mov QWORD PTR [rcx+8],rax // ssize + add rcx,16 + // save uc_mcontext + mov r8,QWORD PTR [rsp] // rta + mov rax,QWORD PTR gs:[0] // SEH + mov QWORD PTR [rcx+0],rax + mov QWORD PTR [rcx+8],rbx + mov QWORD PTR [rcx+16],rsp + mov QWORD PTR [rcx+24],rbp + mov QWORD PTR [rcx+32],rsi + mov QWORD PTR [rcx+40],rdi + mov QWORD PTR [rcx+48],r12 + mov QWORD PTR [rcx+56],r13 + mov QWORD PTR [rcx+64],r14 + mov QWORD PTR [rcx+72],r15 + mov QWORD PTR [rcx+80],r8 // rip + mov QWORD PTR [rcx+88],0 + movaps XMMWORD PTR [rcx+96],xmm6 + movaps XMMWORD PTR [rcx+112],xmm7 + movaps XMMWORD PTR [rcx+128],xmm8 + movaps XMMWORD PTR [rcx+144],xmm9 + movaps XMMWORD PTR [rcx+160],xmm10 + movaps XMMWORD PTR [rcx+176],xmm11 + movaps XMMWORD PTR [rcx+192],xmm12 + movaps XMMWORD PTR [rcx+208],xmm13 + movaps XMMWORD PTR [rcx+224],xmm14 + movaps XMMWORD PTR [rcx+240],xmm15 + mov rcx,rdx + jmp jl_setcontext +#include "END.h" + + +#define CNAME jl_setcontext +#include "ENTRY.amd64.h" + // restore stack registers + mov r8,QWORD PTR [rcx+0] + mov rax,QWORD PTR [rcx+8] + mov QWORD PTR 
gs:[16],r8 // stack top (low) + add rax,r8 + mov QWORD PTR gs:[8],rax // stack bottom (high) + add rcx,16 + // restore uc_mcontext + mov rax,QWORD PTR [rcx+0] + mov rbx,QWORD PTR [rcx+8] + mov rsp,QWORD PTR [rcx+16] + mov rbp,QWORD PTR [rcx+24] + mov rsi,QWORD PTR [rcx+32] + mov rdi,QWORD PTR [rcx+40] + mov r12,QWORD PTR [rcx+48] + mov r13,QWORD PTR [rcx+56] + mov r14,QWORD PTR [rcx+64] + mov r15,QWORD PTR [rcx+72] + mov r8, QWORD PTR [rcx+80] + movaps xmm6,XMMWORD PTR [rcx+96] + movaps xmm7,XMMWORD PTR [rcx+112] + movaps xmm8,XMMWORD PTR [rcx+128] + movaps xmm9,XMMWORD PTR [rcx+144] + movaps xmm10,XMMWORD PTR [rcx+160] + movaps xmm11,XMMWORD PTR [rcx+176] + movaps xmm12,XMMWORD PTR [rcx+192] + movaps xmm13,XMMWORD PTR [rcx+208] + movaps xmm14,XMMWORD PTR [rcx+224] + movaps xmm15,XMMWORD PTR [rcx+240] + mov QWORD PTR gs:[0],rax + mov QWORD PTR [rsp],r8 + xor rax,rax # return 0 + inc rax # HACK: return 1 ret #include "END.h" diff --git a/src/support/win32_ucontext.c b/src/support/win32_ucontext.c new file mode 100644 index 0000000000000..df50eb209341e --- /dev/null +++ b/src/support/win32_ucontext.c @@ -0,0 +1,89 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include "win32_ucontext.h" + +#define WIN32_LEAN_AND_MEAN +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern LONG WINAPI jl_exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo); + +// Instead of using ntdll!_except_handler4, we call directly to our UnhandledExceptionFilter. +// This seems to work better, since it's unclear if we have made a valid frame +// that meets the expectations of that internal function. 
+JL_DLLEXPORT EXCEPTION_DISPOSITION NTAPI __julia_personality( + PEXCEPTION_RECORD ExceptionRecord, + void *EstablisherFrame, + PCONTEXT ContextRecord, + void *DispatcherContext) +{ + EXCEPTION_POINTERS ExceptionInfo; + ExceptionInfo.ExceptionRecord = ExceptionRecord; + ExceptionInfo.ContextRecord = ContextRecord; + + EXCEPTION_DISPOSITION rval; + switch (jl_exception_handler(&ExceptionInfo)) { +#ifndef _MSC_VER + case EXCEPTION_EXECUTE_HANDLER: + rval = ExceptionExecuteHandler; + break; +#endif + case EXCEPTION_CONTINUE_EXECUTION: + rval = ExceptionContinueExecution; + break; + case EXCEPTION_CONTINUE_SEARCH: JL_FALLTHROUGH; + default: + rval = ExceptionContinueSearch; + break; + } + + return rval; +} + + +void jl_makecontext(win32_ucontext_t *ucp, void (*func)(void)) +{ + char *stack_top = (char*)ucp->uc_stack.ss_sp + ucp->uc_stack.ss_size; +#if defined(_CPU_X86_64_) + stack_top -= 0x20; // shadow stack +#elif defined(_CPU_X86_) + // install unhandled SEH EXCEPTION_REGISTRATION_RECORD at top of stack + static PEXCEPTION_ROUTINE UnHandler; + if (UnHandler == NULL) { + PEXCEPTION_REGISTRATION_RECORD fs0 = (PEXCEPTION_REGISTRATION_RECORD)__readfsdword(0x0); + while (fs0->Next != (PEXCEPTION_REGISTRATION_RECORD)0xFFFFFFFF) + fs0 = fs0->Next; + UnHandler = fs0->Handler; + } + stack_top -= 0x20; + PEXCEPTION_REGISTRATION_RECORD Registration = (PEXCEPTION_REGISTRATION_RECORD)stack_top; + Registration[0].Next = &Registration[1]; + Registration[0].Handler = &__julia_personality; + Registration[1].Next = (PEXCEPTION_REGISTRATION_RECORD)0xFFFFFFFF; + Registration[1].Handler = UnHandler; +#endif + stack_top -= sizeof(void*); + *(void**)stack_top = 0; // push rta + stack_top -= sizeof(void*); // stack space for ret + _JUMP_BUFFER *jmpbuf = (_JUMP_BUFFER*)&ucp->uc_mcontext; +#if defined(_CPU_X86_64_) + jmpbuf->Rip = (unsigned long long)func; + jmpbuf->Rsp = (unsigned long long)stack_top; + jmpbuf->Rbp = 0; + jmpbuf->Frame = 0; // SEH frame +#elif defined(_CPU_X86_) + 
jmpbuf->Eip = (unsigned long)func; + jmpbuf->Esp = (unsigned long)stack_top; + jmpbuf->Ebp = 0; + jmpbuf->Registration = (unsigned long)&Registration[0]; // SEH frame +#else +#error jl_makecontext not defined for CPU type +#endif +} + +#ifdef __cplusplus +} +#endif diff --git a/src/support/win32_ucontext.h b/src/support/win32_ucontext.h new file mode 100644 index 0000000000000..01daecc292a8a --- /dev/null +++ b/src/support/win32_ucontext.h @@ -0,0 +1,27 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#ifndef JL_WINUCONTEXT_H +#define JL_WINUCONTEXT_H + +#include "dtypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#include +typedef struct { + struct stack_t { + void *ss_sp; + size_t ss_size; + } uc_stack; + jmp_buf uc_mcontext; +} win32_ucontext_t; +void jl_makecontext(win32_ucontext_t *ucp, void (*func)(void)); +void jl_swapcontext(win32_ucontext_t *oucp, const win32_ucontext_t *ucp); +void jl_setcontext(const win32_ucontext_t *ucp); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/task.c b/src/task.c index f83dfcf6df561..75656b82d0da7 100644 --- a/src/task.c +++ b/src/task.c @@ -5,15 +5,25 @@ lightweight processes (symmetric coroutines) */ -//// enable this for ifndef COPY_STACKS to work on linux -//#ifdef _FORTIFY_SOURCE -//// disable __longjmp_chk validation so that we can jump between stacks -//#pragma push_macro("_FORTIFY_SOURCE") -//#undef _FORTIFY_SOURCE -//#include -//#pragma pop_macro("_FORTIFY_SOURCE") +// need this to get the real definition of ucontext_t, +// if we're going to use the ucontext_t implementation there +//#if defined(__APPLE__) && defined(JL_HAVE_UCONTEXT) +//#pragma push_macro("_XOPEN_SOURCE") +//#define _XOPEN_SOURCE +//#include +//#pragma pop_macro("_XOPEN_SOURCE") //#endif +// this is needed for !COPY_STACKS to work on linux +#ifdef _FORTIFY_SOURCE +// disable __longjmp_chk validation so that we can jump between stacks +// (which would normally be invalid to do with setjmp / longjmp) 
+#pragma push_macro("_FORTIFY_SOURCE") +#undef _FORTIFY_SOURCE +#include +#pragma pop_macro("_FORTIFY_SOURCE") +#endif + #include "platform.h" #include @@ -31,173 +41,101 @@ extern "C" { #endif #if defined(_OS_WINDOWS_) -#include -#include volatile int jl_in_stackwalk = 0; #else -#include -#include // for mprotect -#include // for dladdr +#include // mmap +#ifdef JL_HAVE_UCONTEXT +#include #endif - -/* This probing code is derived from Douglas Jones' user thread library */ -static void _probe_arch(void); - -#ifndef __clang_analyzer__ -/* true if stack grows up, false if down */ -static int _stack_grows_up; - -/* the offset of the beginning of the stack frame in a function */ -static size_t _frame_offset; - -struct _probe_data { - intptr_t low_bound; /* below probe on stack */ - intptr_t probe_local; /* local to probe on stack */ - intptr_t high_bound; /* above probe on stack */ - intptr_t prior_local; /* value of probe_local from earlier call */ - - jl_jmp_buf probe_env; /* saved environment of probe */ - jl_jmp_buf probe_sameAR; /* second environment saved by same call */ - jl_jmp_buf probe_samePC; /* environment saved on previous call */ - - jl_jmp_buf * ref_probe; /* switches between probes */ -}; - -static void boundhigh(struct _probe_data *p) -{ - int c; - p->high_bound = (intptr_t)&c; -} - -static void probe(struct _probe_data *p) -{ - p->prior_local = p->probe_local; - p->probe_local = (intptr_t)&p; - jl_setjmp( *(p->ref_probe), 0 ); - p->ref_probe = &p->probe_env; - jl_setjmp( p->probe_sameAR, 0 ); - boundhigh(p); -} - -static void boundlow(struct _probe_data *p) -{ - p->low_bound = (intptr_t)&p; - probe(p); -} - -// we need this function to exist so we can measure its stack frame! 
-static void NOINLINE_DECL(fill(struct _probe_data *p)); - -static void fill(struct _probe_data *p) -{ - boundlow(p); -} - -static void _infer_direction_from(int *first_addr) -{ - int second; - _stack_grows_up = (first_addr < &second); -} - -static void _infer_stack_direction(void) -{ - int first; - _infer_direction_from(&first); -} - -static int mangle_pointers; - -static void _probe_arch(void) -{ - struct _probe_data p; - memset(p.probe_env, 0, sizeof(jl_jmp_buf)); - memset(p.probe_sameAR, 0, sizeof(jl_jmp_buf)); - memset(p.probe_samePC, 0, sizeof(jl_jmp_buf)); - p.ref_probe = &p.probe_samePC; - - _infer_stack_direction(); - - /* do a probe with filler on stack */ - fill(&p); - /* do a probe without filler */ - boundlow(&p); - -#if defined(__linux__) && defined(__i386__) - jl_ptls_t ptls = jl_get_ptls_states(); - char **s = (char**)p.ref_probe; - mangle_pointers = !(s[4] > ptls->stack_lo && - s[4] < ptls->stack_hi); -#elif defined(__linux__) && defined(__x86_64__) - jl_ptls_t ptls = jl_get_ptls_states(); - char **s = (char**)p.ref_probe; - mangle_pointers = !(s[6] > ptls->stack_lo && - s[6] < ptls->stack_hi); -#else - mangle_pointers = 0; #endif - intptr_t prior_diff = p.probe_local - p.prior_local; - _frame_offset = labs(prior_diff); -} +// empirically, finish_task needs about 64k stack space to infer/run +// and additionally, gc-stack reserves 64k for the guard pages +#if defined(MINSIGSTKSZ) && MINSIGSTKSZ > 131072 +#define MINSTKSZ MINSIGSTKSZ +#else +#define MINSTKSZ 131072 #endif -/* end probing code */ - static jl_sym_t *done_sym; static jl_sym_t *failed_sym; static jl_sym_t *runnable_sym; extern size_t jl_page_size; jl_datatype_t *jl_task_type; +static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner); +static void jl_set_fiber(jl_ucontext_t *t); +static void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t); +static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t); -#ifdef COPY_STACKS -#if (defined(_CPU_X86_64_) || 
defined(_CPU_X86_) || defined(_CPU_AARCH64_) || defined(_CPU_ARM_)) && !defined(_COMPILER_MICROSOFT_) -#define ASM_COPY_STACKS +#ifdef JL_HAVE_UNW_CONTEXT +static JL_THREAD unw_cursor_t jl_basecursor; #endif +#ifdef COPY_STACKS + +static void memcpy_a16(uint64_t *to, uint64_t *from, size_t nb) +{ + memcpy((char*)jl_assume_aligned(to, 16), (char*)jl_assume_aligned(from, 16), nb); + //uint64_t *end = (uint64_t*)((char*)from + nb); + //while (from < end) + // *(to++) = *(from++); +} + static void NOINLINE save_stack(jl_ptls_t ptls, jl_task_t *lastt, jl_task_t **pt) { - if (lastt->state == done_sym || lastt->state == failed_sym) - return; - char *frame_addr = (char*)jl_get_frame_addr(); + char *frame_addr = (char*)((uintptr_t)jl_get_frame_addr() & ~15); char *stackbase = (char*)ptls->stackbase; - size_t nb = stackbase > frame_addr ? stackbase - frame_addr : 0; - char *buf; - if (lastt->stkbuf == NULL || lastt->bufsz < nb) { - buf = (char*)jl_gc_alloc_buf(ptls, nb); + assert(stackbase > frame_addr); + size_t nb = stackbase - frame_addr; + void *buf; + if (lastt->bufsz < nb) { + buf = (void*)jl_gc_alloc_buf(ptls, nb); lastt->stkbuf = buf; lastt->bufsz = nb; } else { - buf = (char*)lastt->stkbuf; + buf = lastt->stkbuf; } - lastt->ssize = nb; *pt = lastt; // clear the gc-root for the target task before copying the stack for saving - memcpy(buf, frame_addr, nb); + lastt->copy_stack = nb; + memcpy_a16((uint64_t*)buf, (uint64_t*)frame_addr, nb); // this task's stack could have been modified after // it was marked by an incremental collection // move the barrier back instead of walking it again here jl_gc_wb_back(lastt); } -static void NOINLINE restore_stack(jl_ptls_t ptls, char *p) +static void NOINLINE JL_NORETURN restore_stack(jl_ptls_t ptls, char *p) { jl_task_t *t = ptls->current_task; - char *_x = (char*)ptls->stackbase - t->ssize; + size_t nb = t->copy_stack; + char *_x = (char*)ptls->stackbase - nb; if (!p) { + // switch to a stackframe that's beyond the bounds of the 
last switch p = _x; if ((char*)&_x > _x) { p = (char*)alloca((char*)&_x - _x); } - restore_stack(ptls, p); // pass p to ensure the compiler can't tailcall this + restore_stack(ptls, p); // pass p to ensure the compiler can't tailcall this or avoid the alloca } assert(t->stkbuf != NULL); - memcpy(_x, t->stkbuf, t->ssize); // destroys all but the current stackframe - jl_longjmp(t->ctx, 1); + memcpy_a16((uint64_t*)_x, (uint64_t*)t->stkbuf, nb); // destroys all but the current stackframe + jl_set_fiber(&t->ctx); + abort(); // unreachable +} +static void restore_stack2(jl_ptls_t ptls, jl_task_t *lastt) +{ + jl_task_t *t = ptls->current_task; + size_t nb = t->copy_stack; + char *_x = (char*)ptls->stackbase - nb; + assert(t->stkbuf != NULL); + memcpy_a16((uint64_t*)_x, (uint64_t*)t->stkbuf, nb); // destroys all but the current stackframe + jl_swap_fiber(&lastt->ctx, &t->ctx); } #endif -static jl_function_t *task_done_hook_func=NULL; +static jl_function_t *task_done_hook_func = NULL; static void JL_NORETURN finish_task(jl_task_t *t, jl_value_t *resultval JL_MAYBE_UNROOTED) { @@ -207,12 +145,10 @@ static void JL_NORETURN finish_task(jl_task_t *t, jl_value_t *resultval JL_MAYBE t->state = failed_sym; else t->state = done_sym; + if (t->copy_stack) // early free of stkbuf + t->stkbuf = NULL; t->result = resultval; jl_gc_wb(t, t->result); - // TODO: early free of t->stkbuf -#ifdef COPY_STACKS - t->stkbuf = (void*)(intptr_t)-1; -#endif // ensure that state is cleared ptls->in_finalizer = 0; ptls->in_pure_callback = 0; @@ -249,59 +185,9 @@ static void record_backtrace(void) JL_NOTSAFEPOINT ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE); } -static void NOINLINE JL_NORETURN JL_USED_FUNC start_task(void) -{ - jl_ptls_t ptls = jl_get_ptls_states(); - // this runs the first time we switch to a task - jl_task_t *t = ptls->current_task; - jl_value_t *res; - t->started = 1; - if (t->exception != jl_nothing) { - record_backtrace(); - res = t->exception; - } - else { - JL_TRY 
{ - if (ptls->defer_signal) { - ptls->defer_signal = 0; - jl_sigint_safepoint(ptls); - } - JL_TIMING(ROOT); - ptls->world_age = jl_world_counter; - res = jl_apply(&t->start, 1); - } - JL_CATCH { - res = ptls->exception_in_transit; - t->exception = res; - jl_gc_wb(t, res); - } - } - finish_task(t, res); - gc_debug_critical_error(); - abort(); -} - -#ifdef COPY_STACKS -void NOINLINE jl_set_base_ctx(char *__stk) -{ - jl_ptls_t ptls = jl_get_ptls_states(); - ptls->stackbase = (char*)(((uintptr_t)__stk + sizeof(*__stk))&-16); // also ensures stackbase is 16-byte aligned -#ifndef ASM_COPY_STACKS - if (jl_setjmp(ptls->base_ctx, 0)) { - start_task(); - } -#endif -} -#endif JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) { - // keep this function small, since we want to keep the stack frame - // leading up to this also quite small -#ifdef COPY_STACKS - char __stk; - jl_set_base_ctx(&__stk); // separate function, to record the size of a stack frame -#endif _julia_init(rel); } @@ -309,102 +195,104 @@ static void ctx_switch(jl_ptls_t ptls, jl_task_t **pt) { jl_task_t *t = *pt; assert(t != ptls->current_task); + jl_task_t *lastt = ptls->current_task; #ifdef ENABLE_TIMINGS - jl_timing_block_t *blk = ptls->current_task->timing_stack; + jl_timing_block_t *blk = lastt->timing_stack; if (blk) jl_timing_block_stop(blk); #endif - if (!jl_setjmp(ptls->current_task->ctx, 0)) { - // backtraces don't survive task switches, see e.g. issue #12485 - ptls->bt_size = 0; - jl_task_t *lastt = ptls->current_task; -#ifdef COPY_STACKS - save_stack(ptls, lastt, pt); // allocates (gc-safepoint, and can also fail) -#else - *pt = lastt; // can't fail after here: clear the gc-root for the target task now + // backtraces don't survive task switches, see e.g. 
issue #12485 + ptls->bt_size = 0; +#ifdef JULIA_ENABLE_THREADING + // If the current task is not holding any locks, free the locks list + // so that it can be GC'd without leaking memory + arraylist_t *locks = &lastt->locks; + if (locks->len == 0 && locks->items != locks->_space) { + arraylist_free(locks); + arraylist_new(locks, 0); + } #endif - // set up global state for new task - lastt->gcstack = ptls->pgcstack; - lastt->world_age = ptls->world_age; - ptls->pgcstack = t->gcstack; - ptls->world_age = t->world_age; -#ifdef JULIA_ENABLE_THREADING - // If the current task is not holding any locks, free the locks list - // so that it can be GC'd without leaking memory - arraylist_t *locks = &ptls->current_task->locks; - if (locks->len == 0 && locks->items != locks->_space) { - arraylist_free(locks); - arraylist_new(locks, 0); + int started = (t->stkbuf != NULL); + int killed = (lastt->state == done_sym || lastt->state == failed_sym); + if (!started && !t->copy_stack) { + // may need to allocate the stack + t->stkbuf = jl_alloc_fiber(&t->ctx, &t->bufsz, t); + } + + if (killed) { + *pt = lastt; // can't fail after here: clear the gc-root for the target task now + lastt->gcstack = NULL; + // if (!lastt->copy_stack) { // TODO: early free of stkbuf + // jl_free_stack(lastt->stkbuf, lastt->bufsz); + // lastt->stkbuf = NULL; + // } + } + else { +#ifdef COPY_STACKS + if (lastt->copy_stack) { // save the old copy-stack + save_stack(ptls, lastt, pt); // allocates (gc-safepoint, and can also fail) + if (jl_setjmp(lastt->ctx.uc_mcontext, 0)) { +#ifdef ENABLE_TIMINGS + assert(blk == ptls->current_task->timing_stack); + if (blk) + jl_timing_block_start(blk); +#endif + return; + } } + else #endif + *pt = lastt; // can't fail after here: clear the gc-root for the target task now + lastt->gcstack = ptls->pgcstack; + } - // restore task's current module, looking at parent tasks - // if it hasn't set one. 
- jl_task_t *last = t; - while (last->current_module == NULL && last != ptls->root_task) { - last = last->parent; - } - if (last->current_module != NULL) { - ptls->current_module = last->current_module; - } + // set up global state for new task + lastt->world_age = ptls->world_age; + ptls->pgcstack = t->gcstack; + ptls->world_age = t->world_age; + t->gcstack = NULL; + + // DEPRECATED: + // restore task's current module, looking at parent tasks + // if it hasn't set one. + jl_task_t *last = t; + while (last->current_module == NULL && last != last->parent) + last = last->parent; + if (last->current_module != NULL) + jl_current_module = last->current_module; - ptls->current_task = t; + ptls->current_task = t; + jl_ucontext_t *lastt_ctx = (killed ? NULL : &lastt->ctx); #ifdef COPY_STACKS - if (t->stkbuf) { - restore_stack(ptls, NULL); - } - else { -#ifdef ASM_COPY_STACKS - // Start the task without `setjmp` - void *stackbase = ptls->stackbase; -#ifdef _CPU_X86_64_ -#ifdef _OS_WINDOWS_ - stackbase = (char*)stackbase - 0x20; -#endif - asm(" movq %0, %%rsp;\n" - " xorq %%rbp, %%rbp;\n" - " push %%rbp;\n" // instead of RSP - " jmp %P1;\n" // call `start_task` with fake stack frame - " ud2" - : : "r"(stackbase), "i"(&start_task) : "memory" ); -#elif defined(_CPU_X86_) - asm(" movl %0, %%esp;\n" - " xorl %%ebp, %%ebp;\n" - " push %%ebp;\n" // instead of ESP - " jmp %P1;\n" // call `start_task` with fake stack frame - " ud2" - : : "r"(stackbase), "X"(&start_task) : "memory" ); -#elif defined(_CPU_AARCH64_) - asm(" mov sp, %0;\n" - " mov x29, xzr;\n" // Clear link register (x29) and frame pointer - " mov x30, xzr;\n" // (x30) to terminate unwinder. - " b %1;\n" // call `start_task` with fake stack frame - " brk #0x1" // abort - : : "r"(stackbase), "S"(&start_task) : "memory" ); -#elif defined(_CPU_ARM_) - // A "i" constraint on `&start_task` works only on clang and not on GCC. 
- asm(" mov sp, %0;\n" - " mov lr, #0;\n" // Clear link register (lr) and frame pointer - " mov fp, #0;\n" // (fp) to terminate unwinder. - " b start_task;\n" // call `start_task` with fake stack frame - " udf #0" // abort - : : "r"(stackbase) : "memory" ); -#else -#error ASM_COPY_STACKS not supported on this cpu architecture -#endif -#else // ASM_COPY_STACKS - jl_longjmp(ptls->base_ctx, 1); + if (lastt->copy_stack) + // if we are switching between copy-stacks, + // don't save the old copy-stack + // instead resume at jl_setjmp of the other task, + // after restoring the stack + lastt_ctx = NULL; #endif - jl_unreachable(); + if (started) { +#ifdef COPY_STACKS + if (t->copy_stack) { + if (lastt_ctx) + restore_stack2(ptls, lastt); + else + restore_stack(ptls, NULL); // (doesn't return) } -#else - jl_longjmp(t->ctx, 1); + else #endif + if (!lastt_ctx) + jl_set_fiber(&t->ctx); + else + jl_swap_fiber(lastt_ctx, &t->ctx); + } + else { + jl_start_fiber(lastt_ctx, &t->ctx); } #ifdef ENABLE_TIMINGS - assert(blk == jl_current_task->timing_stack); + assert(blk == ptls->current_task->timing_stack); if (blk) jl_timing_block_start(blk); #endif @@ -418,7 +306,7 @@ JL_DLLEXPORT void jl_switchto(jl_task_t **pt) return; } if (t->state == done_sym || t->state == failed_sym || - (t->stkbuf == (void*)(intptr_t)-1)) { + (t->started && t->stkbuf == NULL)) { ptls->current_task->exception = t->exception; ptls->current_task->result = t->result; return; @@ -437,104 +325,6 @@ JL_DLLEXPORT void jl_switchto(jl_task_t **pt) jl_sigint_safepoint(ptls); } -#ifndef COPY_STACKS - -#ifdef __linux__ -#if defined(__i386__) -static intptr_t ptr_mangle(intptr_t p) -{ - intptr_t ret; - asm(" movl %1, %%eax;\n" - " xorl %%gs:0x18, %%eax;" - " roll $9, %%eax;" - " movl %%eax, %0;" - : "=r"(ret) : "r"(p) : "%eax"); - return ret; -} -static intptr_t ptr_demangle(intptr_t p) -{ - intptr_t ret; - asm(" movl %1, %%eax;\n" - " rorl $9, %%eax;" - " xorl %%gs:0x18, %%eax;" - " movl %%eax, %0;" - : "=r"(ret) : "r"(p) 
: "%eax" ); - return ret; -} -#elif defined(__x86_64__) -static intptr_t ptr_mangle(intptr_t p) -{ - intptr_t ret; - asm(" movq %1, %%rax;\n" - " xorq %%fs:0x30, %%rax;" - " rolq $17, %%rax;" - " movq %%rax, %0;" - : "=r"(ret) : "r"(p) : "%rax"); - return ret; -} -static intptr_t ptr_demangle(intptr_t p) -{ - intptr_t ret; - asm(" movq %1, %%rax;\n" - " rorq $17, %%rax;" - " xorq %%fs:0x30, %%rax;" - " movq %%rax, %0;" - : "=r"(ret) : "r"(p) : "%rax" ); - return ret; -} -#endif -#endif //__linux__ - -/* rebase any values in saved state to the new stack */ -static void rebase_state(jl_jmp_buf *ctx, intptr_t local_sp, intptr_t new_sp) -{ - intptr_t *s = (intptr_t*)ctx; - intptr_t diff = new_sp - local_sp; /* subtract old base, and add new base */ -#if defined(__linux__) && defined(__i386__) - s[3] += diff; - if (mangle_pointers) - s[4] = ptr_mangle(ptr_demangle(s[4])+diff); - else - s[4] += diff; -#elif defined(__linux__) && defined(__x86_64__) - if (mangle_pointers) { - s[1] = ptr_mangle(ptr_demangle(s[1])+diff); - s[6] = ptr_mangle(ptr_demangle(s[6])+diff); - } - else { - s[1] += diff; - s[6] += diff; - } -#elif defined(__APPLE__) && defined(__i386__) - s[8] += diff; - s[9] += diff; -#elif defined(__APPLE__) && defined(__x86_64__) - s[1] += diff; - s[2] += diff; -#else -#error "COPY_STACKS must be defined on this platform." 
-#endif -} -static void init_task(jl_task_t *t, char *stack) -{ - if (jl_setjmp(t->ctx, 0)) { - start_task(); - } - // this runs when the task is created - intptr_t local_sp = (intptr_t)&t; - intptr_t new_sp = (intptr_t)stack + t->ssize - _frame_offset; -#ifdef _P64 - // SP must be 16-byte aligned - new_sp = new_sp&-16; - local_sp = local_sp&-16; -#endif - memcpy((void*)new_sp, (void*)local_sp, _frame_offset); - rebase_state(&t->ctx, local_sp, new_sp); -} - -#endif /* !COPY_STACKS */ - -jl_timing_block_t *jl_pop_timing_block(jl_timing_block_t *cur_block); JL_DLLEXPORT JL_NORETURN void jl_no_exc_handler(jl_value_t *e) JL_NOTSAFEPOINT { jl_printf(JL_STDERR, "fatal: error thrown and no exception handler available.\n"); @@ -544,6 +334,8 @@ JL_DLLEXPORT JL_NORETURN void jl_no_exc_handler(jl_value_t *e) JL_NOTSAFEPOINT jl_exit(1); } +jl_timing_block_t *jl_pop_timing_block(jl_timing_block_t *cur_block); + // yield to exception handler void JL_NORETURN throw_internal(jl_value_t *e JL_MAYBE_UNROOTED) { @@ -595,15 +387,21 @@ JL_DLLEXPORT void jl_rethrow_other(jl_value_t *e) JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, size_t ssize) { jl_ptls_t ptls = jl_get_ptls_states(); - size_t pagesz = jl_page_size; - jl_task_t *t = (jl_task_t*)jl_gc_alloc(ptls, sizeof(jl_task_t), - jl_task_type); -#ifndef COPY_STACKS - if (ssize == 0) // unspecified -- pick some default size - ssize = 1*1024*1024; // 1M (for now) -#endif - ssize = LLT_ALIGN(ssize, pagesz); - t->ssize = ssize; + jl_task_t *t = (jl_task_t*)jl_gc_alloc(ptls, sizeof(jl_task_t), jl_task_type); + t->copy_stack = 0; + if (ssize == 0) { +#ifdef COPY_STACKS + t->copy_stack = 1; + t->bufsz = 0; +#else + t->bufsz = JL_STACK_SIZE; // unspecified -- use the default size +#endif + } + else { + if (ssize < MINSTKSZ) + ssize = MINSTKSZ; + t->bufsz = ssize; + } t->current_module = NULL; t->parent = ptls->current_task; t->tls = jl_nothing; @@ -617,6 +415,7 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, size_t 
ssize) t->logstate = ptls->current_task->logstate; // there is no active exception handler available on this stack yet t->eh = NULL; + t->tid = 0; t->gcstack = NULL; t->stkbuf = NULL; t->tid = 0; @@ -624,42 +423,20 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, size_t ssize) #ifdef ENABLE_TIMINGS t->timing_stack = NULL; #endif - -#ifdef COPY_STACKS - t->bufsz = 0; -#else - JL_GC_PUSH1(&t); - - size_t stkbuf_sz = ssize + pagesz + (pagesz - 1); - char *stk = (char*)jl_gc_alloc_buf(ptls, stkbuf_sz); - t->stkbuf = stk; - jl_gc_wb_buf(t, t->stkbuf, stkbuf_sz); - stk = (char*)LLT_ALIGN((uintptr_t)stk, pagesz); - // add a guard page to detect stack overflow - if (mprotect(stk, pagesz-1, PROT_NONE) == -1) - jl_errorf("mprotect: %s", strerror(errno)); - stk += pagesz; - - init_task(t, stk); - jl_gc_add_finalizer((jl_value_t*)t, jl_unprotect_stack_func); - JL_GC_POP(); -#endif - #ifdef JULIA_ENABLE_THREADING arraylist_new(&t->locks, 0); #endif - return t; -} -#ifndef COPY_STACKS -static void jl_unprotect_stack(jl_task_t *t) -{ - size_t pagesz = jl_page_size; - char *stk = (char*)LLT_ALIGN((uintptr_t)t->stkbuf, pagesz); - // unprotect stack so it can be reallocated for something else - mprotect(stk, pagesz - 1, PROT_READ|PROT_WRITE); -} +#if defined(JL_DEBUG_BUILD) + if (!t->copy_stack) + memset(&t->ctx, 0, sizeof(t->ctx)); #endif +#ifdef COPY_STACKS + if (t->copy_stack) + memcpy(&t->ctx, &ptls->base_ctx, sizeof(t->ctx)); +#endif + return t; +} JL_DLLEXPORT jl_value_t *jl_get_current_task(void) { @@ -667,12 +444,9 @@ JL_DLLEXPORT jl_value_t *jl_get_current_task(void) return (jl_value_t*)ptls->current_task; } -jl_function_t *jl_unprotect_stack_func; - // Do one-time initializations for task system void jl_init_tasks(void) JL_GC_DISABLED { - _probe_arch(); jl_task_type = (jl_datatype_t*) jl_new_datatype(jl_symbol("Task"), NULL, @@ -704,26 +478,345 @@ void jl_init_tasks(void) JL_GC_DISABLED done_sym = jl_symbol("done"); failed_sym = jl_symbol("failed"); 
runnable_sym = jl_symbol("runnable"); +} + +static void NOINLINE JL_NORETURN start_task(void) +{ + // this runs the first time we switch to a task + jl_ptls_t ptls = jl_get_ptls_states(); + jl_task_t *t = ptls->current_task; + jl_value_t *res; + t->started = 1; + if (t->exception != jl_nothing) { + record_backtrace(); + res = t->exception; + } + else { + JL_TRY { + if (ptls->defer_signal) { + ptls->defer_signal = 0; + jl_sigint_safepoint(ptls); + } + JL_TIMING(ROOT); + ptls->world_age = jl_world_counter; + res = jl_apply(&t->start, 1); + } + JL_CATCH { + res = jl_exception_in_transit; + t->exception = res; + jl_gc_wb(t, res); + } + } + finish_task(t, res); + gc_debug_critical_error(); + abort(); +} + -#ifndef COPY_STACKS - jl_unprotect_stack_func = jl_box_voidpointer(&jl_unprotect_stack); +#if defined(JL_HAVE_UCONTEXT) +#ifdef _OS_WINDOWS_ +#define setcontext jl_setcontext +#define getcontext jl_getcontext +#define swapcontext jl_swapcontext +#define makecontext jl_makecontext +#endif +static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner) +{ +#ifndef _OS_WINDOWS_ + int r = getcontext(t); + if (r != 0) + jl_error("getcontext failed"); #endif + void *stk = jl_malloc_stack(ssize, owner); + t->uc_stack.ss_sp = stk; + t->uc_stack.ss_size = *ssize; +#ifdef _OS_WINDOWS_ + makecontext(t, &start_task); +#else + t->uc_link = NULL; + makecontext(t, &start_task, 0); +#endif + return (char*)stk; +} +static void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) +{ + if (lastt) + swapcontext(lastt, t); + else + setcontext(t); +} +static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) +{ + swapcontext(lastt, t); +} +static void jl_set_fiber(jl_ucontext_t *t) +{ + setcontext(t); } +static void jl_init_basefiber(size_t ssize) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + char *stkbuf = jl_alloc_fiber(&ptls->base_ctx, &ssize, NULL); + ptls->stackbase = stkbuf + ssize; + ptls->stacksize = ssize; +} +#endif + +#if defined(JL_HAVE_UNW_CONTEXT) 
+static void start_basefiber(void) // entry point of the base fiber (installed as its IP by jl_alloc_fiber below)
+{
+    jl_ptls_t ptls = jl_get_ptls_states();
+    if (jl_setjmp(ptls->base_ctx.uc_mcontext, 0)) // non-zero return: base_ctx was resumed later
+        start_task();
+    jl_longjmp(jl_root_task->ctx.uc_mcontext, 1); // first pass: hand control back to the root task
+}
+#if defined(_CPU_X86_) || defined(_CPU_X86_64_)
+#define PUSH_RET(ctx, stk) \
+    do { \
+        stk -= sizeof(uintptr_t); \
+        *(uintptr_t*)stk = 0; /* push null RIP/EIP onto the stack */ \
+    } while (0)
+#elif defined(_CPU_ARM_)
+#define PUSH_RET(ctx, stk) \
+    unw_set_reg(ctx, UNW_ARM_R14, 0) /* put NULL into the LR */
+#else
+#error please define how to simulate a CALL on this platform
+#endif
+static char *jl_alloc_fiber(unw_context_t *t, size_t *ssize, jl_task_t *owner) // allocates a stack, points jl_basecursor's SP/IP at it, returns the buffer
+{
+    char *stkbuf = (char*)jl_malloc_stack(ssize, owner);
+    char *stk = stkbuf;
+    stk += *ssize; // start from the high end of the buffer
+    PUSH_RET(&jl_basecursor, stk);
+    if (unw_set_reg(&jl_basecursor, UNW_REG_SP, (uintptr_t)stk) != 0) {
+        jl_free_stack((void*)stkbuf, *ssize);
+        jl_error("unw_set_reg UNW_REG_SP failed");
+    }
+    uintptr_t fn;
+    if (t == &jl_get_ptls_states()->base_ctx) // the base fiber gets its own entry point
+        fn = (uintptr_t)&start_basefiber;
+    else
+        fn = (uintptr_t)&start_task;
+    if (unw_set_reg(&jl_basecursor, UNW_REG_IP, fn) != 0) {
+        jl_free_stack((void*)stkbuf, *ssize);
+        jl_error("unw_set_reg UNW_REG_IP failed");
+    }
+    return stkbuf;
+}
+static void jl_start_fiber(unw_context_t *lastt, unw_context_t *t) // saves lastt (if given), then resumes jl_basecursor
+{
+    if (lastt && jl_setjmp(lastt->uc_mcontext, 0))
+        return;
+    unw_resume(&jl_basecursor); // (doesn't return)
+}
+static void jl_swap_fiber(unw_context_t *lastt, unw_context_t *t) // saves lastt, then jumps to t
+{
+    if (jl_setjmp(lastt->uc_mcontext, 0))
+        return;
+    jl_longjmp(t->uc_mcontext, 1); // (doesn't return)
+}
+static void jl_set_fiber(unw_context_t *t) // jumps to t without saving the current context
+{
+    jl_longjmp(t->uc_mcontext, 1);
+}
+static void jl_init_basefiber(size_t ssize)
+{
+    jl_ptls_t ptls = jl_get_ptls_states(); // used by the unw_* calls below, with or without COPY_STACKS
+    int r = unw_getcontext(&ptls->base_ctx);
+    if (r != 0)
+        jl_error("unw_getcontext failed");
+    r = unw_init_local(&jl_basecursor, &ptls->base_ctx);
+    if (r != 0)
+        jl_error("unw_init_local failed");
+#ifdef COPY_STACKS
+    char *stkbuf = jl_alloc_fiber(&ptls->base_ctx, &ssize, NULL);
+    ptls->stackbase = stkbuf + ssize;
+    ptls->stacksize = ssize;
+    jl_start_fiber(&jl_root_task->ctx, &ptls->base_ctx); // finishes initializing ptls->base_ctx (start_basefiber jumps back via jl_root_task->ctx)
+#endif
+}
+#endif
+
+#if defined(JL_HAVE_ASM) // fibers started with hand-written assembly, switched with jl_setjmp/jl_longjmp
+static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner)
+{
+    char *stkbuf = (char*)jl_malloc_stack(ssize, owner);
+    ((char**)t)[0] = stkbuf; // stash the stack pointer somewhere for start_fiber
+    ((size_t*)t)[1] = *ssize; // stash the stack size somewhere for start_fiber
+    return stkbuf;
+}
+static void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) // saves lastt (if given), then enters start_task on t's stack
+{
+    if (lastt && jl_setjmp(lastt->uc_mcontext, 0))
+        return;
+    char *stk = ((char**)t)[0]; // values stashed by jl_alloc_fiber
+    size_t ssize = ((size_t*)t)[1];
+    uintptr_t fn = (uintptr_t)&start_task;
+    stk += ssize; // point at the high end of the buffer
+#ifdef _CPU_X86_64_
+    asm volatile (
+        " movq %0, %%rsp;\n"
+        " movq %1, %%rax;\n"
+        " xorq %%rbp, %%rbp;\n"
+        " push %%rbp;\n" // instead of RSP
+        " jmpq *%%rax;\n" // call `fn` with fake stack frame
+        " ud2"
+        : : "r"(stk), "r"(fn) : "memory" );
+#elif defined(_CPU_X86_)
+    asm volatile (
+        " movl %0, %%esp;\n"
+        " movl %1, %%eax;\n"
+        " xorl %%ebp, %%ebp;\n"
+        " push %%ebp;\n" // instead of ESP
+        " jmpl *%%eax;\n" // call `fn` with fake stack frame
+        " ud2"
+        : : "r"(stk), "r"(fn) : "memory" );
+#elif defined(_CPU_AARCH64_)
+    asm volatile(
+        " mov sp, %0;\n"
+        " mov x29, xzr;\n" // Clear frame pointer (x29) and link register
+        " mov x30, xzr;\n" // (x30) to terminate unwinder.
+        " br %1;\n" // call `fn` with fake stack frame
+        " brk #0x1" // abort
+        : : "r" (stk), "r"(fn) : "memory" );
+#elif defined(_CPU_ARM_)
+    // A "i" constraint on `&start_task` works only on clang and not on GCC.
+    asm(" mov sp, %0;\n"
+        " mov lr, #0;\n" // Clear link register (lr) and frame pointer
+        " mov fp, #0;\n" // (fp) to terminate unwinder.
+        " bx %1;\n" // call `fn` with fake stack frame (32-bit ARM branches to a register with `bx`; `br` is AArch64-only)
+        " udf #0" // abort
+        : : "r" (stk), "r"(fn) : "memory" );
+#else
+#error JL_HAVE_ASM defined but not implemented for this CPU type
+#endif
+    __builtin_unreachable();
+}
+static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) // saves lastt, then jumps to t
+{
+    if (jl_setjmp(lastt->uc_mcontext, 0))
+        return;
+    jl_longjmp(t->uc_mcontext, 1); // (doesn't return)
+}
+static void jl_set_fiber(jl_ucontext_t *t) // jumps to t without saving the current context
+{
+    jl_longjmp(t->uc_mcontext, 1);
+}
+static void jl_init_basefiber(size_t ssize) // with COPY_STACKS: allocates the shared base stack and records its bounds in ptls
+{
+#ifdef COPY_STACKS
+    jl_ptls_t ptls = jl_get_ptls_states();
+    char *stkbuf = jl_alloc_fiber(&ptls->base_ctx, &ssize, NULL);
+    ptls->stackbase = stkbuf + ssize;
+    ptls->stacksize = ssize;
+#endif
+}
+#endif
+
+#if defined(JL_HAVE_SIGALTSTACK) // fibers bootstrapped by taking SIGUSR2 on an alternate signal stack
+static void start_basefiber(int sig) // SIGUSR2 handler: the signature must match sa_handler, i.e. void (*)(int)
+{
+    jl_ptls_t ptls = jl_get_ptls_states();
+    if (jl_setjmp(ptls->base_ctx.uc_mcontext, 0)) // non-zero return: the captured context was resumed later
+        start_task();
+}
+static char *jl_alloc_fiber(jl_ucontext_t *t, size_t *ssize, jl_task_t *owner)
+{
+    stack_t uc_stack, osigstk;
+    struct sigaction sa, osa;
+    sigset_t set, oset;
+    void *stk = jl_malloc_stack(ssize, owner);
+    jl_ptls_t ptls = jl_get_ptls_states(); // setup
+    jl_ucontext_t base_ctx; // saved copy: start_basefiber overwrites ptls->base_ctx
+    memcpy(&base_ctx, &ptls->base_ctx, sizeof(ptls->base_ctx));
+    sigfillset(&set);
+    if (sigprocmask(SIG_BLOCK, &set, &oset) != 0) {
+        jl_free_stack(stk, *ssize);
+        jl_error("sigprocmask failed");
+    }
+    uc_stack.ss_sp = stk;
+    uc_stack.ss_size = *ssize;
+    uc_stack.ss_flags = 0;
+    if (sigaltstack(&uc_stack, &osigstk) != 0) {
+        jl_free_stack(stk, *ssize);
+        jl_error("sigaltstack failed");
+    }
+    memset(&sa, 0, sizeof(sa));
+    sigemptyset(&sa.sa_mask);
+    sa.sa_handler = start_basefiber;
+    sa.sa_flags = SA_ONSTACK;
+    if (sigaction(SIGUSR2, &sa, &osa) != 0) {
+        jl_free_stack(stk, *ssize);
+        jl_error("sigaction failed");
+    }
+    // emit signal
+    pthread_kill(pthread_self(), SIGUSR2); // initializes ptls->base_ctx (via start_basefiber)
+    sigdelset(&set, SIGUSR2);
+    sigsuspend(&set);
+    // cleanup
+    if (sigaction(SIGUSR2, &osa, NULL) != 0) {
+        jl_free_stack(stk, *ssize);
+        jl_error("sigaction failed");
+    }
+    if (osigstk.ss_size < MINSTKSZ && (osigstk.ss_flags & SS_DISABLE)) // only pad the size of a previously disabled alt stack
+        osigstk.ss_size = MINSTKSZ;
+    if (sigaltstack(&osigstk, NULL) != 0) {
+        jl_free_stack(stk, *ssize);
+        jl_error("sigaltstack failed");
+    }
+    if (sigprocmask(SIG_SETMASK, &oset, NULL) != 0) {
+        jl_free_stack(stk, *ssize);
+        jl_error("sigprocmask failed");
+    }
+    memcpy(t, &ptls->base_ctx, sizeof(ptls->base_ctx)); // hand the captured context to the caller's `*t`
+    memcpy(&ptls->base_ctx, &base_ctx, sizeof(ptls->base_ctx)); // restore the saved base_ctx
+    return (char*)stk;
+}
+static void jl_start_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) // saves lastt (if given), then jumps to t
+{
+    if (lastt && jl_setjmp(lastt->uc_mcontext, 0))
+        return;
+    jl_longjmp(t->uc_mcontext, 1); // (doesn't return)
+}
+static void jl_swap_fiber(jl_ucontext_t *lastt, jl_ucontext_t *t) // saves lastt, then jumps to t
+{
+    if (jl_setjmp(lastt->uc_mcontext, 0))
+        return;
+    jl_longjmp(t->uc_mcontext, 1); // (doesn't return)
+}
+static void jl_set_fiber(jl_ucontext_t *t) // jumps to t without saving the current context
+{
+    jl_longjmp(t->uc_mcontext, 1);
+}
+static void jl_init_basefiber(size_t ssize)
+{
+#ifdef COPY_STACKS
+    jl_ptls_t ptls = jl_get_ptls_states();
+    char *stkbuf = jl_alloc_fiber(&jl_root_task->ctx, &ssize, NULL); // capture into the root task's context...
+    ptls->stackbase = stkbuf + ssize;
+    ptls->stacksize = ssize;
+    memcpy(&ptls->base_ctx, &jl_root_task->ctx, sizeof(ptls->base_ctx)); // ...then publish it as base_ctx
+#endif
+}
+#endif

 // Initialize a root task using the given stack.
-void jl_init_root_task(void *stack, size_t ssize) +void jl_init_root_task(void *stack_lo, void *stack_hi) { jl_ptls_t ptls = jl_get_ptls_states(); ptls->current_task = (jl_task_t*)jl_gc_alloc(ptls, sizeof(jl_task_t), jl_task_type); -#ifdef COPY_STACKS - ptls->current_task->ssize = 0; // size of saved piece - ptls->current_task->bufsz = 0; - ptls->current_task->stkbuf = NULL; -#else - ptls->current_task->ssize = ssize; - ptls->current_task->stkbuf = stack; + ptls->current_task->copy_stack = 0; + void *stack = stack_lo; + size_t ssize = (char*)stack_hi - (char*)stack_lo; +#ifndef _OS_WINDOWS_ + if (ptls->tid == 0) { + stack = (void*)((char*)stack - 3000000); // offset our guess of the address of the bottom of stack to cover the guard pages too + ssize += 3000000; // sizeof stack is known exactly, but not where we are in that stack + } #endif + ptls->current_task->stkbuf = stack; + ptls->current_task->bufsz = ssize; ptls->current_task->started = 1; ptls->current_task->parent = ptls->current_task; ptls->current_task->current_module = ptls->current_module; @@ -742,9 +835,10 @@ void jl_init_root_task(void *stack, size_t ssize) arraylist_new(&ptls->current_task->locks, 0); #endif + ptls->exception_in_transit = (jl_value_t*)jl_nothing; ptls->root_task = ptls->current_task; - ptls->exception_in_transit = (jl_value_t*)jl_nothing; + jl_init_basefiber(JL_STACK_SIZE); } JL_DLLEXPORT int jl_is_task_started(jl_task_t *t) @@ -752,6 +846,49 @@ JL_DLLEXPORT int jl_is_task_started(jl_task_t *t) return t->started; } +#ifdef _OS_WINDOWS_ +#if defined(_CPU_X86_) +extern DWORD32 __readgsdword(int); +extern DWORD32 __readgs(void); +#endif +JL_DLLEXPORT void jl_gdb_dump_threadinfo(void) +{ +#if defined(_CPU_X86_64_) + DWORD64 gs0 = __readgsqword(0x0); + DWORD64 gs8 = __readgsqword(0x8); + DWORD64 gs16 = __readgsqword(0x10); + jl_safe_printf("ThreadId: %u, Stack: %p -- %p to %p, SEH: %p\n", + (unsigned)GetCurrentThreadId(), + jl_get_frame_addr(), + (void*)gs8, (void*)gs16, (void*)gs0); 
+#elif defined(_CPU_X86_) + DWORD32 fs0 = __readfsdword(0x0); + DWORD32 fs4 = __readfsdword(0x4); + DWORD32 fs8 = __readfsdword(0x8); + jl_safe_printf("ThreadId: %u, Stack: %p -- %p to %p, SEH: %p\n", + (unsigned)GetCurrentThreadId(), + jl_get_frame_addr(), + (void*)fs4, (void*)fs8, (void*)fs0); + if (__readgs()) { // WoW64 if GS is non-zero + DWORD32 gs0 = __readgsdword(0x0); + DWORD32 gs4 = __readgsdword(0x4); + DWORD32 gs8 = __readgsdword(0x8); + DWORD32 gs12 = __readgsdword(0xc); + DWORD32 gs16 = __readgsdword(0x10); + DWORD32 gs20 = __readgsdword(0x14); + jl_safe_printf("Stack64: %p%p to %p%p, SEH64: %p%p\n", + (void*)gs12, (void*)gs8, + (void*)gs20, (void*)gs16, + (void*)gs4, (void*)gs0); + } +#else + jl_safe_printf("ThreadId: %u, Stack: %p\n", + (unsigned)GetCurrentThreadId(), + jl_get_frame_addr()); +#endif +} +#endif + #ifdef __cplusplus } #endif diff --git a/src/threading.c b/src/threading.c index e75c0949d9ce8..f41f85f11a0ea 100644 --- a/src/threading.c +++ b/src/threading.c @@ -361,13 +361,11 @@ void ti_threadfun(void *arg) // initialize this thread (set tid, create heap, etc.) ti_initthread(ta->tid); - jl_init_stack_limits(0); + void *stack_lo, *stack_hi; + jl_init_stack_limits(0, &stack_lo, &stack_hi); // set up tasking - jl_init_root_task(ptls->stack_lo, ptls->stack_hi - ptls->stack_lo); -#ifdef COPY_STACKS - jl_set_base_ctx((char*)&arg); -#endif + jl_init_root_task(stack_lo, stack_hi); // set the thread-local tid and wait for a thread group while (jl_atomic_load_acquire(&ta->state) == TI_THREAD_INIT) diff --git a/test/stack_overflow.jl b/test/stack_overflow.jl new file mode 100644 index 0000000000000..33d667fa479ab --- /dev/null +++ b/test/stack_overflow.jl @@ -0,0 +1,19 @@ +# This file is a part of Julia. 
License is MIT: https://julialang.org/license + +using Test + +# helper function for returning stderr and stdout +# from running a command (ignoring failure status) +function readchomperrors(exename::Cmd) + out = Base.PipeEndpoint() + err = Base.PipeEndpoint() + p = run(exename, devnull, out, err, wait=false) + o = @async(readchomp(out)) + e = @async(readchomp(err)) + return (success(p), fetch(o), fetch(e)) +end + +let exename = Base.julia_cmd() + @show readchomperrors(`$exename -e "f() = f(); f()"`) + @show readchomperrors(`$exename -e "f() = f(); fetch(@schedule f())"`) +end From 8e043280c311aeeda1cc2999505b658678fb4ea9 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 16 May 2018 21:59:59 -0400 Subject: [PATCH 2/2] early free of non-copy-stacks back to pool --- src/gc-stacks.c | 14 ++++++++++++++ src/task.c | 10 ++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 8515fb7797be6..998e763049164 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -101,6 +101,20 @@ JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz) } +void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task) +{ + void *stkbuf = task->stkbuf; + size_t bufsz = task->bufsz; + if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { + unsigned pool_id = select_pool(bufsz); + if (pool_sizes[pool_id] == bufsz) { + task->stkbuf = NULL; + arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); + } + } +} + + JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) { jl_ptls_t ptls = jl_get_ptls_states(); diff --git a/src/task.c b/src/task.c index 75656b82d0da7..04d6157b06d0b 100644 --- a/src/task.c +++ b/src/task.c @@ -191,6 +191,8 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) _julia_init(rel); } +void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task); + static void ctx_switch(jl_ptls_t ptls, jl_task_t **pt) { jl_task_t *t = *pt; @@ -223,10 +225,10 @@ static void ctx_switch(jl_ptls_t ptls, jl_task_t **pt) if 
(killed) { *pt = lastt; // can't fail after here: clear the gc-root for the target task now lastt->gcstack = NULL; - // if (!lastt->copy_stack) { // TODO: early free of stkbuf - // jl_free_stack(lastt->stkbuf, lastt->bufsz); - // lastt->stkbuf = NULL; - // } + if (!lastt->copy_stack && lastt->stkbuf) { + // early free of stkbuf back to the pool + jl_release_task_stack(ptls, lastt); + } } else { #ifdef COPY_STACKS