diff --git a/contrib/bpftrace/rt_all.bt b/contrib/bpftrace/rt_all.bt new file mode 100755 index 00000000000000..d4de28e354a508 --- /dev/null +++ b/contrib/bpftrace/rt_all.bt @@ -0,0 +1,81 @@ +#!/usr/bin/env bpftrace + +BEGIN +{ + printf("Tracing Julia Task events... Hit Ctrl-C to end.\n"); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__run__task +{ + printf("Task running: %x\n", arg0); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__pause__task +{ + printf("Task pausing: %x\n", arg0); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__new__task +{ + printf("Task created: %x (Parent %x)\n", arg1, arg0); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__start__task +{ + printf("Task started: %x\n", arg0); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__finish__task +{ + printf("Task finished: %x\n", arg0); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__start__process__events +{ + printf("Task processing libuv events: %x\n", arg0); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__finish__process__events +{ + printf("Task processed libuv events: %x\n", arg0); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__taskq__insert +{ + printf("Thread %x inserting task to multiq: %x\n", arg0, arg1); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__taskq__get +{ + printf("Thread %x popped task from multiq: %x\n", arg0, arg1); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__sleep__check__wake +{ + printf("Thread waking: %x (was sleeping?: %d)\n", arg0, arg1); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__sleep__check__wakeup +{ + printf("Thread wakeup: %x\n", arg0); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__sleep__check__sleep +{ + printf("Thread trying to sleep: %x\n", arg0); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__sleep__check__taskq__wake +{ + printf("Thread waking due to non-empty task queue: %x\n", arg0); +} + +usdt:usr/lib/libjulia-internal.so:julia:rt__sleep__check__task__wake +{ + printf("Thread waking due to popped task: %x\n", arg0); +} 
+ +usdt:usr/lib/libjulia-internal.so:julia:rt__sleep__check__uv__wake +{ + printf("Thread waking due to libuv: %x\n", arg0); +} diff --git a/doc/src/devdocs/probes.md b/doc/src/devdocs/probes.md index 7e2fb96df5ac8d..c5cd21b7c91db4 100644 --- a/doc/src/devdocs/probes.md +++ b/doc/src/devdocs/probes.md @@ -90,11 +90,37 @@ the probe handler. 3. `julia:gc__mark__begin`: Beginning the mark phase 4. `julia:gc__mark_end(scanned_bytes, perm_scanned)`: Mark phase ended 5. `julia:gc__sweep_begin(full)`: Starting sweep -6. `julia:gc__sweep_end()`: Sweep phase finished +6. `julia:gc__sweep_end`: Sweep phase finished 7. `julia:gc__end`: GC is finished, other threads continue work 8. `julia:gc__finalizer`: Initial GC thread has finished running finalizers -#### GC stop-the-world latency +### Task runtime probes + +1. `julia:rt__run__task(task)`: Switching to task `task` on current thread. +2. `julia:rt__pause__task(task)`: Switching from task `task` on current thread. +3. `julia:rt__new__task(parent, child)`: Task `parent` created task `child` on current thread. +4. `julia:rt__start__task(task)`: Task `task` started for the first time with a new stack. +5. `julia:rt__finish__task(task)`: Task `task` finished and will no longer execute. +6. `julia:rt__start__process__events(task)`: Task `task` started processing libuv events. +7. `julia:rt__finish__process__events(task)`: Task `task` finished processing libuv events. + +### Task queue probes + +1. `julia:rt__taskq__insert(ptls, task)`: Thread `ptls` attempted to insert `task` into a PARTR multiq. +2. `julia:rt__taskq__get(ptls, task)`: Thread `ptls` popped `task` from a PARTR multiq. + +### Thread sleep/wake probes + +1. `julia:rt__sleep__check__wake(ptls, old_state)`: Thread (PTLS `ptls`) waking up, previously in state `old_state`. +2. `julia:rt__sleep__check__wakeup(ptls)`: Thread (PTLS `ptls`) woke itself up. +3. `julia:rt__sleep__check__sleep(ptls)`: Thread (PTLS `ptls`) is attempting to sleep. +4. 
`julia:rt__sleep__check__taskq__wake(ptls)`: Thread (PTLS `ptls`) fails to sleep due to tasks in PARTR multiq. +5. `julia:rt__sleep__check__task__wake(ptls)`: Thread (PTLS `ptls`) fails to sleep due to tasks in Base workqueue. +6. `julia:rt__sleep__check__uv__wake(ptls)`: Thread (PTLS `ptls`) fails to sleep due to libuv wakeup. + +## Probe usage examples + +### GC stop-the-world latency An example `bpftrace` script is given in `contrib/gc_stop_the_world_latency.bt` and it creates a histogram of the latency for all threads to reach a safepoint. @@ -143,6 +169,136 @@ Tracing Julia GC Stop-The-World Latency... Hit Ctrl-C to end. We can see the latency distribution of the stop-the-world phase in the executed Julia process. +### Task spawn monitor + +It's sometimes useful to know when a task is spawning other tasks. This is very +easy to see with `rt__new__task`. The first argument to the probe, `parent`, is +the existing task which is creating a new task. This means that if you know the +address of the task you want to monitor, you can easily just look at the tasks +that that specific task spawned. Let's see how to do this; first let's start a +Julia session and get the PID and REPL's task address: + +``` +> julia + _ + _ _ _(_)_ | Documentation: https://docs.julialang.org + (_) | (_) (_) | + _ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help. + | | | | | | |/ _` | | + | | |_| | | | (_| | | Version 1.6.2 (2021-07-14) + _/ |\__'_|_|_|\__'_| | Official https://julialang.org/ release +|__/ | + +1> getpid() +997825 + +2> current_task() +Task (runnable) @0x00007f524d088010 +``` + +Now we can start `bpftrace` and have it monitor `rt__new__task` for *only* this parent: + +`sudo bpftrace -p 997825 -e 'usdt:usr/lib/libjulia-internal.so:julia:rt__new__task /arg0==0x00007f524d088010/{ printf("Task: %x\n", arg0); }'` + +(Note that in the above, `arg0` is the first argument, `parent`). 
+ +And if we spawn a single task: + +`@async 1+1` + +we see this task being created: + +`Task: 4d088010` + +However, if we spawn a bunch of tasks from that newly-spawned task: + +```julia +@async for i in 1:10 + @async 1+1 +end +``` + +we still only see one task from `bpftrace`: + +`Task: 4d088010` + +and it's still the same task we were monitoring! Of course, we can remove this +filter to see *all* newly-created tasks just as easily: + +`sudo bpftrace -p 997825 -e 'usdt:usr/lib/libjulia-internal.so:julia:rt__new__task { printf("Task: %x\n", arg0); }'` + +``` +Task: 4d088010 +Task: 4dc4e290 +Task: 4dc4e290 +Task: 4dc4e290 +Task: 4dc4e290 +Task: 4dc4e290 +Task: 4dc4e290 +Task: 4dc4e290 +Task: 4dc4e290 +Task: 4dc4e290 +Task: 4dc4e290 +``` + +We can see our root task, and the newly-spawned task as the parent of the ten +even newer tasks. + +### Thundering herd detection + +Task runtimes can often suffer from the "thundering herd" problem: when some +work is added to a quiet task runtime, all threads may be woken up from their +slumber, even if there isn't enough work for each thread to process. This can +cause extra latency and CPU cycles while all threads awaken (and simultaneously +go back to sleep, not finding any work to execute). + +We can see this problem illustrated with `bpftrace` quite easily. First, in one terminal we start Julia with multiple threads (6 in this example), and get the PID of that process: + +``` +> julia -t 6 + _ + _ _ _(_)_ | Documentation: https://docs.julialang.org + (_) | (_) (_) | + _ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help. 
+ | | | | | | |/ _` | | + | | |_| | | | (_| | | Version 1.6.2 (2021-07-14) + _/ |\__'_|_|_|\__'_| | Official https://julialang.org/ release +|__/ | + +1> getpid() +997825 +``` + +And in another terminal we start `bpftrace` monitoring our process, +specifically probing the `rt__sleep__check__wake` hook: + +`sudo bpftrace -p 997825 -e 'usdt:usr/lib/libjulia-internal.so:julia:rt__sleep__check__wake { printf("Thread wake up! %x\n", arg0); }'` + +Now, we create and execute a single task in Julia: + +`Threads.@spawn 1+1` + +And in `bpftrace` we see printed out something like: + +``` +Thread wake up! 3f926100 +Thread wake up! 3ebd5140 +Thread wake up! 3f876130 +Thread wake up! 3e2711a0 +Thread wake up! 3e312190 +``` + +Even though we only spawned a single task (which only one thread could process +at a time), we woke up all of our other threads! In the future, a smarter task +runtime might only wake up a single thread (or none at all; the spawning thread +could execute this task!), and we should see this behavior go away. + +### Task Monitor with BPFnative.jl + +BPFnative.jl is able to attach to USDT probe points just like `bpftrace`. There +is a demo available for monitoring the task runtime, GC, and thread sleep/wake +transitions [here](https://github.com/jpsamaroo/BPFnative.jl/blob/master/examples/task-runtime.jl). 
+ ## Notes on using `bpftrace` An example probe in the bpftrace format looks like: diff --git a/src/jl_uv.c b/src/jl_uv.c index e803e6263142d5..d2372b59c4670d 100644 --- a/src/jl_uv.c +++ b/src/jl_uv.c @@ -205,8 +205,10 @@ JL_DLLEXPORT int jl_process_events(void) jl_gc_safepoint_(ct->ptls); if (loop && (jl_atomic_load_relaxed(&_threadedregion) || jl_atomic_load_relaxed(&ct->tid) == 0)) { if (jl_atomic_load_relaxed(&jl_uv_n_waiters) == 0 && jl_mutex_trylock(&jl_uv_mutex)) { + JL_PROBE_RT_START_PROCESS_EVENTS(ct); loop->stop_flag = 0; int r = uv_run(loop, UV_RUN_NOWAIT); + JL_PROBE_RT_FINISH_PROCESS_EVENTS(ct); JL_UV_UNLOCK(); return r; } diff --git a/src/julia_internal.h b/src/julia_internal.h index 64710a579c1f8c..5b7b81e7a66c56 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1543,6 +1543,21 @@ uint16_t __gnu_f2h_ieee(float param) JL_NOTSAFEPOINT; #define JL_PROBE_GC_SWEEP_END() do ; while (0) #define JL_PROBE_GC_END() do ; while (0) #define JL_PROBE_GC_FINALIZER() do ; while (0) +#define JL_PROBE_RT_RUN_TASK(task) do ; while (0) +#define JL_PROBE_RT_PAUSE_TASK(task) do ; while (0) +#define JL_PROBE_RT_NEW_TASK(parent, child) do ; while (0) +#define JL_PROBE_RT_START_TASK(task) do ; while (0) +#define JL_PROBE_RT_FINISH_TASK(task) do ; while (0) +#define JL_PROBE_RT_START_PROCESS_EVENTS(task) do ; while (0) +#define JL_PROBE_RT_FINISH_PROCESS_EVENTS(task) do ; while (0) +#define JL_PROBE_RT_TASKQ_INSERT(ptls, task) do ; while (0) +#define JL_PROBE_RT_TASKQ_GET(ptls, task) do ; while (0) +#define JL_PROBE_RT_SLEEP_CHECK_WAKE(other, old_state) do ; while (0) +#define JL_PROBE_RT_SLEEP_CHECK_WAKEUP(ptls) do ; while (0) +#define JL_PROBE_RT_SLEEP_CHECK_SLEEP(ptls) do ; while (0) +#define JL_PROBE_RT_SLEEP_CHECK_TASKQ_WAKE(ptls) do ; while (0) +#define JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls) do ; while (0) +#define JL_PROBE_RT_SLEEP_CHECK_UV_WAKE(ptls) do ; while (0) #define JL_PROBE_GC_BEGIN_ENABLED() (0) #define JL_PROBE_GC_STOP_THE_WORLD_ENABLED() 
(0) @@ -1552,6 +1567,21 @@ uint16_t __gnu_f2h_ieee(float param) JL_NOTSAFEPOINT; #define JL_PROBE_GC_SWEEP_END_ENABLED() (0) #define JL_PROBE_GC_END_ENABLED() (0) #define JL_PROBE_GC_FINALIZER_ENABLED() (0) +#define JL_PROBE_RT_RUN_TASK_ENABLED() (0) +#define JL_PROBE_RT_PAUSE_TASK_ENABLED() (0) +#define JL_PROBE_RT_NEW_TASK_ENABLED() (0) +#define JL_PROBE_RT_START_TASK_ENABLED() (0) +#define JL_PROBE_RT_FINISH_TASK_ENABLED() (0) +#define JL_PROBE_RT_START_PROCESS_EVENTS_ENABLED() (0) +#define JL_PROBE_RT_FINISH_PROCESS_EVENTS_ENABLED() (0) +#define JL_PROBE_RT_TASKQ_INSERT_ENABLED() (0) +#define JL_PROBE_RT_TASKQ_GET_ENABLED() (0) +#define JL_PROBE_RT_SLEEP_CHECK_WAKE_ENABLED() (0) +#define JL_PROBE_RT_SLEEP_CHECK_WAKEUP_ENABLED() (0) +#define JL_PROBE_RT_SLEEP_CHECK_SLEEP_ENABLED() (0) +#define JL_PROBE_RT_SLEEP_CHECK_TASKQ_WAKE_ENABLED() (0) +#define JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE_ENABLED() (0) +#define JL_PROBE_RT_SLEEP_CHECK_UV_WAKE_ENABLED() (0) #endif #endif diff --git a/src/partr.c b/src/partr.c index 22c9c15605f210..c8cc3245ebb4c0 100644 --- a/src/partr.c +++ b/src/partr.c @@ -302,9 +302,12 @@ void jl_threadfun(void *arg) // enqueue the specified task for execution JL_DLLEXPORT int jl_enqueue_task(jl_task_t *task) { + char failed; if (multiq_insert(task, task->prio) == -1) - return 1; - return 0; + failed = 1; + else failed = 0; + JL_PROBE_RT_TASKQ_INSERT(jl_current_task->ptls, task); + return failed; } @@ -365,6 +368,7 @@ static int wake_thread(int16_t tid) if (jl_atomic_load_relaxed(&other->sleep_check_state) == sleeping) { if (jl_atomic_cmpswap_relaxed(&other->sleep_check_state, &state, not_sleeping)) { + JL_PROBE_RT_SLEEP_CHECK_WAKE(other, state); uv_mutex_lock(&sleep_locks[tid]); uv_cond_signal(&wake_signals[tid]); uv_mutex_unlock(&sleep_locks[tid]); @@ -394,8 +398,10 @@ JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) if (tid == self || tid == -1) { // we're already awake, but make sure we'll exit uv_run jl_ptls_t ptls = ct->ptls; - if 
(jl_atomic_load_relaxed(&ptls->sleep_check_state) == sleeping) + if (jl_atomic_load_relaxed(&ptls->sleep_check_state) == sleeping) { jl_atomic_store_relaxed(&ptls->sleep_check_state, not_sleeping); + JL_PROBE_RT_SLEEP_CHECK_WAKEUP(ptls); + } if (uvlock == ct) uv_stop(jl_global_event_loop()); } @@ -444,7 +450,10 @@ static jl_task_t *get_next_task(jl_value_t *trypoptask, jl_value_t *q) jl_set_task_tid(task, self); return task; } - return multiq_deletemin(); + task = multiq_deletemin(); + if (task) + JL_PROBE_RT_TASKQ_GET(jl_current_task->ptls, task); + return task; } static int may_sleep(jl_ptls_t ptls) JL_NOTSAFEPOINT @@ -482,24 +491,31 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q) // acquire sleep-check lock jl_atomic_store_relaxed(&ptls->sleep_check_state, sleeping); jl_fence(); // [^store_buffering_1] + JL_PROBE_RT_SLEEP_CHECK_SLEEP(ptls); if (!multiq_check_empty()) { // uses relaxed loads - if (jl_atomic_load_relaxed(&ptls->sleep_check_state) != not_sleeping) + if (jl_atomic_load_relaxed(&ptls->sleep_check_state) != not_sleeping) { jl_atomic_store_relaxed(&ptls->sleep_check_state, not_sleeping); // let other threads know they don't need to wake us + JL_PROBE_RT_SLEEP_CHECK_TASKQ_WAKE(ptls); + } continue; } task = get_next_task(trypoptask, q); // note: this should not yield if (ptls != ct->ptls) { // sigh, a yield was detected, so let's go ahead and handle it anyway by starting over ptls = ct->ptls; - if (jl_atomic_load_relaxed(&ptls->sleep_check_state) != not_sleeping) + if (jl_atomic_load_relaxed(&ptls->sleep_check_state) != not_sleeping) { jl_atomic_store_relaxed(&ptls->sleep_check_state, not_sleeping); // let other threads know they don't need to wake us + JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls); + } if (task) return task; continue; } if (task) { - if (jl_atomic_load_relaxed(&ptls->sleep_check_state) != not_sleeping) + if (jl_atomic_load_relaxed(&ptls->sleep_check_state) != not_sleeping) { 
jl_atomic_store_relaxed(&ptls->sleep_check_state, not_sleeping); // let other threads know they don't need to wake us + JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls); + } return task; } @@ -554,8 +570,10 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q) if (!jl_atomic_load_relaxed(&_threadedregion) && active && ptls->tid == 0) { // thread 0 is the only thread permitted to run the event loop // so it needs to stay alive, just spin-looping if necessary - if (jl_atomic_load_relaxed(&ptls->sleep_check_state) != not_sleeping) + if (jl_atomic_load_relaxed(&ptls->sleep_check_state) != not_sleeping) { jl_atomic_store_relaxed(&ptls->sleep_check_state, not_sleeping); // let other threads know they don't need to wake us + JL_PROBE_RT_SLEEP_CHECK_UV_WAKE(ptls); + } start_cycles = 0; continue; } diff --git a/src/task.c b/src/task.c index 9f55b9b64a8338..1dd4e76b8ba1ce 100644 --- a/src/task.c +++ b/src/task.c @@ -229,6 +229,7 @@ static _Atomic(jl_function_t*) task_done_hook_func JL_GLOBALLY_ROOTED = NULL; void JL_NORETURN jl_finish_task(jl_task_t *t) { jl_task_t *ct = jl_current_task; + JL_PROBE_RT_FINISH_TASK(ct); JL_SIGATOMIC_BEGIN(); if (jl_atomic_load_relaxed(&t->_isexception)) jl_atomic_store_release(&t->_state, JL_TASK_STATE_FAILED); @@ -532,6 +533,8 @@ JL_DLLEXPORT void jl_switch(void) if (!jl_set_task_tid(t, jl_atomic_load_relaxed(&ct->tid))) // manually yielding to a task jl_error("cannot switch to task running on another thread"); + JL_PROBE_RT_PAUSE_TASK(ct); + // Store old values on the stack and reset sig_atomic_t defer_signal = ptls->defer_signal; int8_t gc_state = jl_gc_unsafe_enter(ptls); @@ -579,6 +582,8 @@ JL_DLLEXPORT void jl_switch(void) ptls->defer_signal = defer_signal; if (other_defer_signal && !defer_signal) jl_sigint_safepoint(ptls); + + JL_PROBE_RT_RUN_TASK(ct); } JL_DLLEXPORT void jl_switchto(jl_task_t **pt) @@ -753,6 +758,7 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion { jl_task_t *ct = 
jl_current_task; jl_task_t *t = (jl_task_t*)jl_gc_alloc(ct->ptls, sizeof(jl_task_t), jl_task_type); + JL_PROBE_RT_NEW_TASK(ct, t); t->copy_stack = 0; if (ssize == 0) { // stack size unspecified; use default @@ -908,6 +914,7 @@ CFI_NORETURN #endif ct->started = 1; + JL_PROBE_RT_START_TASK(ct); if (jl_atomic_load_relaxed(&ct->_isexception)) { record_backtrace(ptls, 0); jl_push_excstack(&ct->excstack, ct->result, diff --git a/src/uprobes.d b/src/uprobes.d index a583daf8c58f40..29dc39598c1da6 100644 --- a/src/uprobes.d +++ b/src/uprobes.d @@ -9,6 +9,22 @@ provider julia { probe gc__sweep__end(); probe gc__end(); probe gc__finalizer(); + + probe rt__run__task(jl_task_t *task); + probe rt__pause__task(jl_task_t *task); + probe rt__new__task(jl_task_t *parent, jl_task_t *child); + probe rt__start__task(jl_task_t *task); + probe rt__finish__task(jl_task_t *task); + probe rt__start__process__events(jl_task_t *task); + probe rt__finish__process__events(jl_task_t *task); + probe rt__taskq__insert(jl_ptls_t ptls, jl_task_t *task); + probe rt__taskq__get(jl_ptls_t ptls, jl_task_t *task); + probe rt__sleep__check__wake(jl_ptls_t other, int8_t old_state); + probe rt__sleep__check__wakeup(jl_ptls_t ptls); + probe rt__sleep__check__sleep(jl_ptls_t ptls); + probe rt__sleep__check__taskq__wake(jl_ptls_t ptls); + probe rt__sleep__check__task__wake(jl_ptls_t ptls); + probe rt__sleep__check__uv__wake(jl_ptls_t ptls); }; #pragma D attributes Evolving/Evolving/Common provider julia provider