From 9d7cfa84e27c61a23d870e292e33741b0896c4f2 Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Thu, 9 Mar 2023 17:38:33 -0500 Subject: [PATCH 01/12] Add `JL_TIMING` support for Tracy --- Make.inc | 4 ++-- deps/libtracyclient.mk | 1 + src/gc.c | 25 +++++++++++++++++++++++++ src/options.h | 2 +- src/task.c | 16 ++++++++++++++-- src/timing.h | 24 ++++++++++++++++++++++++ 6 files changed, 67 insertions(+), 5 deletions(-) diff --git a/Make.inc b/Make.inc index 1285de0dc3590..4d91cd9df9525 100644 --- a/Make.inc +++ b/Make.inc @@ -738,8 +738,8 @@ LIBITTAPI:=-littnotify endif ifeq ($(WITH_TRACY), 1) -JCXXFLAGS += -DUSE_TRACY -DTRACY_ENABLE -JCFLAGS += -DUSE_TRACY -DTRACY_ENABLE +JCXXFLAGS += -DUSE_TRACY -DTRACY_ENABLE -DTRACY_FIBERS +JCFLAGS += -DUSE_TRACY -DTRACY_ENABLE -DTRACY_FIBERS LIBTRACYCLIENT:=-lTracyClient endif diff --git a/deps/libtracyclient.mk b/deps/libtracyclient.mk index 29427889c1d6a..aee5d6e969770 100644 --- a/deps/libtracyclient.mk +++ b/deps/libtracyclient.mk @@ -11,6 +11,7 @@ LIBTRACYCLIENT_CMAKE := LIBTRACYCLIENT_CMAKE += -DBUILD_SHARED_LIBS=ON LIBTRACYCLIENT_CMAKE += -DTRACY_FIBERS=ON LIBTRACYCLIENT_CMAKE += -DTRACY_NO_BROADCAST=ON +LIBTRACYCLIENT_CMAKE += -DTRACY_NO_SYSTEM_TRACING=ON LIBTRACYCLIENT_CMAKE += -DTRACY_ONLY_LOCALHOST=ON LIBTRACYCLIENT_CMAKE += -DTRACY_NO_CODE_TRANSFER=ON LIBTRACYCLIENT_CMAKE += -DTRACY_NO_FRAME_IMAGE=ON diff --git a/src/gc.c b/src/gc.c index 158a03be017fa..fc7c89cd03b66 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3243,6 +3243,21 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); return; } +#ifdef USE_TRACY + static uint8_t first_time = 1; + if (first_time) { + first_time = 0; + TracyCFiberEnter("Main"); + } + TracyCFiberLeave; + TracyCFiberEnter("GC"); + { + int64_t tb; + jl_gc_get_total_bytes(&tb); + TracyCPlot("Heap size", ((double)tb) / (1024.0 * 1024.0)); + } +#endif +{ JL_TIMING(GC); int last_errno = errno; #ifdef _OS_WINDOWS_ @@ -3307,6 +3322,16 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) #endif errno = last_errno; } +#ifdef USE_TRACY + { + int64_t tb; + jl_gc_get_total_bytes(&tb); + TracyCPlot("Heap size", ((double)tb) / (1024.0 * 1024.0)); + } + TracyCFiberLeave; + TracyCFiberEnter("Main"); +#endif +} void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) { diff --git a/src/options.h b/src/options.h index 82b71431ecea0..12822b6ecf8b0 100644 --- a/src/options.h +++ b/src/options.h @@ -79,7 +79,7 @@ // #define OBJPROFILE // Automatic Instrumenting Profiler -//#define ENABLE_TIMINGS +#define ENABLE_TIMINGS // method dispatch profiling -------------------------------------------------- diff --git a/src/task.c b/src/task.c index 580ad444d8cad..1035d54f2ce9a 100644 --- a/src/task.c +++ b/src/task.c @@ -1211,6 +1211,9 @@ CFI_NORETURN sanitizer_finish_switch_fiber(ptls->previous_task, ct); _start_task(); } + +const char* fiber = "task"; + STATIC_OR_JS void NOINLINE JL_NORETURN _start_task(void) { CFI_NORETURN @@ -1246,8 +1249,14 @@ CFI_NORETURN ptls->defer_signal = 0; jl_sigint_safepoint(ptls); } - JL_TIMING(ROOT); - res = jl_apply(&ct->start, 1); +//#ifdef USE_TRACY + //TracyFiberEnter(fiber); +//#endif + { + // TODO: Re-enable + //JL_TIMING(ROOT); + res = jl_apply(&ct->start, 1); + } } JL_CATCH { res = jl_current_exception(); @@ -1256,6 +1265,9 @@ CFI_NORETURN } skip_pop_exception:; } +//#ifdef USE_TRACY + //TracyFiberLeave(fiber); +//#endif ct->result = res; jl_gc_wb(ct, ct->result); jl_finish_task(ct); diff --git a/src/timing.h b/src/timing.h index 70f34fa89f543..c9a1fd1fb4807 100644 --- a/src/timing.h +++ b/src/timing.h @@ -17,6 +17,9 @@ void jl_destroy_timing(void) JL_NOTSAFEPOINT; #else #include "julia_assert.h" +#ifdef USE_TRACY +#include "tracy/TracyC.h" +#endif #ifdef __cplusplus extern "C" { @@ -78,6 +81,9 @@ extern uint64_t jl_timing_data[(int)JL_TIMING_LAST]; extern const char *jl_timing_names[(int)JL_TIMING_LAST]; struct _jl_timing_block_t { // typedef in julia.h +#ifdef USE_TRACY + TracyCZoneCtx *tracy_ctx; +#endif jl_timing_block_t *prev; uint64_t total; uint64_t t0; @@ -125,6 +131,9 @@ STATIC_INLINE void _jl_timing_block_ctor(jl_timing_block_t *block, int owner) JL } STATIC_INLINE void _jl_timing_block_destroy(jl_timing_block_t *block) JL_NOTSAFEPOINT { +#ifdef USE_TRACY + TracyCZoneEnd(*(block->tracy_ctx)); +#endif uint64_t t = cycleclock(); jl_task_t *ct = jl_current_task; _jl_timing_block_stop(block, t); @@ -150,13 +159,28 @@ struct jl_timing_block_cpp_t { jl_timing_block_cpp_t& operator=(const jl_timing_block_cpp_t &) = delete; jl_timing_block_cpp_t& operator=(const jl_timing_block_cpp_t &&) = delete; }; +#ifdef USE_TRACY +#define JL_TIMING(owner) jl_timing_block_cpp_t __timing_block(JL_TIMING_ ## owner); \ + TracyCZoneN(__tracy_ctx, #owner, strcmp(#owner, "ROOT")); \ + __timing_block.block.tracy_ctx = &__tracy_ctx; +#else #define JL_TIMING(owner) jl_timing_block_cpp_t __timing_block(JL_TIMING_ ## owner) +#endif +#else +#ifdef USE_TRACY +#define JL_TIMING(owner) \ + __attribute__((cleanup(_jl_timing_block_destroy))) \ + jl_timing_block_t __timing_block; \ + _jl_timing_block_ctor(&__timing_block, JL_TIMING_ ## owner); \ + TracyCZoneN(__tracy_ctx, #owner, 1); \ + __timing_block.tracy_ctx = &__tracy_ctx; #else #define JL_TIMING(owner) \ __attribute__((cleanup(_jl_timing_block_destroy))) \ jl_timing_block_t __timing_block; \ _jl_timing_block_ctor(&__timing_block, JL_TIMING_ ## owner) #endif +#endif #endif #endif From 7a0441c41372f6f9d7242b8a75789e7720296a4d Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Thu, 23 Mar 2023 10:48:18 -0400 Subject: [PATCH 02/12] ios: Add fixed-size streams and expand inline storage This includes a few separate enhancements: 1. `growable` is added to allow for fixed-size buffers 2. `ios_t` struct layout is optimized for better packing 3. inline buffer is increased to 83 bytes from 54 This change grows the size of ios_t by 16 bytes (160 -> 172 bytes), but the internal buffer was increased by 29 bytes thanks to re-arranging the fields in the struct to be packed more efficiently. --- src/support/ios.c | 4 ++++ src/support/ios.h | 21 ++++++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/support/ios.c b/src/support/ios.c index 4a6aeb54a4d32..2b3966eca7897 100644 --- a/src/support/ios.c +++ b/src/support/ios.c @@ -196,6 +196,9 @@ static char *_buf_realloc(ios_t *s, size_t sz) if (sz <= s->maxsize) return s->buf; + if (!s->growable) + return NULL; + if (s->ownbuf && s->buf != &s->local[0]) { // if we own the buffer we're free to resize it temp = (char*)LLT_REALLOC(s->buf, sz); @@ -892,6 +895,7 @@ static void _ios_init(ios_t *s) s->readable = 1; s->writable = 1; s->rereadable = 0; + s->growable = 1; } /* stream object initializers. we do no allocation. */ diff --git a/src/support/ios.h b/src/support/ios.h index 046edfae0556f..2547555b5585d 100644 --- a/src/support/ios.h +++ b/src/support/ios.h @@ -19,13 +19,13 @@ extern "C" { typedef enum { bm_none=1000, bm_line, bm_block, bm_mem } bufmode_t; typedef enum { bst_none, bst_rd, bst_wr } bufstate_t; -#define IOS_INLSIZE 54 +#define IOS_INLSIZE 83 #define IOS_BUFSIZE 32768 #ifdef _P64 -#define ON_P64(x) x +#define IF_P64(x,y) x #else -#define ON_P64(x) +#define IF_P64(x,y) y #endif // We allow ios_t as a cvalue in flisp, which only guarantees pointer @@ -36,10 +36,8 @@ JL_ATTRIBUTE_ALIGN_PTRSIZE(typedef struct { // in general, you can do any operation in any state. char *buf; // start of buffer - int errcode; - - ON_P64(int _pad_bm;) // put bm at same offset as type field of uv_stream_s - bufmode_t bm; // + IF_P64(int64_t userdata;, int errcode;) + bufmode_t bm; // bm must be at same offset as type field of uv_stream_s bufstate_t state; int64_t maxsize; // space allocated to buffer @@ -51,6 +49,8 @@ JL_ATTRIBUTE_ALIGN_PTRSIZE(typedef struct { size_t lineno; // current line number size_t u_colno; // current column number (in Unicode charwidths) + IF_P64(int errcode;, int64_t userdata;) + // pointer-size integer to support platforms where it might have // to be a pointer long fd; @@ -74,11 +74,14 @@ JL_ATTRIBUTE_ALIGN_PTRSIZE(typedef struct { // request durable writes (fsync) // unsigned char durable:1; - int64_t userdata; + // this declares that the buffer should not be (re-)alloc'd when + // attempting to write beyond its current maxsize. + unsigned char growable:1; + char local[IOS_INLSIZE]; } ios_t); -#undef ON_P64 +#undef IF_P64 extern void (*ios_set_io_wait_func)(int); /* low-level interface functions */ From 5aa8c1512f7f44447a56ebc706d965c4d5569848 Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Thu, 23 Mar 2023 11:34:13 -0400 Subject: [PATCH 03/12] profiling: Add Task/GC/Inference/Codegen signposts for Tracy This adds a variety of metadata to send to Tracy during profiling, in particular method instance signatures, the Heap size, and a couple of other small pieces of relevant metadata. It also adds full support for Task execution, tracking these as "Fibers" on the Tracy side. This does incur some overhead until an enhancement is implemented on the Tracy-side to avoid a globally synchronous Fiber event queue, but most of the events that we're tracking now are relatively coarse so it should not be a major issue. --- src/aotcompile.cpp | 2 +- src/ast.c | 15 ++-- src/codegen.cpp | 8 +- src/gc.c | 204 +++++++++++++++++++++-------------------- src/gf.c | 14 +-- src/ircode.c | 4 +- src/jitlayers.cpp | 8 +- src/jltypes.c | 6 +- src/julia.h | 4 + src/method.c | 3 +- src/rtutils.c | 12 ++- src/safepoint.c | 4 +- src/staticdata.c | 4 +- src/task.c | 64 +++++++------ src/timing.c | 166 ++++++++++++++++++++++++++++++++-- src/timing.h | 219 ++++++++++++++++++++++++++++++++++----------- src/toplevel.c | 3 +- 17 files changed, 521 insertions(+), 219 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 63d941949343e..fc1d4074e92bb 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1448,7 +1448,7 @@ void jl_dump_native_impl(void *native_code, const char *asm_fname, const char *sysimg_data, size_t sysimg_len, ios_t *s) { - JL_TIMING(NATIVE_DUMP); + JL_TIMING(NATIVE_DUMP, NATIVE_DUMP); jl_native_code_desc_t *data = (jl_native_code_desc_t*)native_code; if (!bc_fname && !unopt_bc_fname && !obj_fname && !asm_fname) { LLVM_DEBUG(dbgs() << "No output requested, skipping native code dump?\n"); diff --git a/src/ast.c b/src/ast.c index 3f3d6176d342e..08c493c75b985 100644 --- a/src/ast.c +++ b/src/ast.c @@ -783,7 +783,8 @@ JL_DLLEXPORT jl_value_t *jl_fl_parse(const char *text, size_t text_len, jl_value_t *filename, size_t lineno, size_t offset, jl_value_t *options) { - JL_TIMING(PARSING); + JL_TIMING(PARSING, PARSING); + jl_timing_show_filename(jl_string_data(filename), JL_TIMING_CURRENT_BLOCK); if (offset > text_len) { jl_value_t *textstr = jl_pchar_to_string(text, text_len); JL_GC_PUSH1(&textstr); @@ -1000,7 +1001,7 @@ int jl_has_meta(jl_array_t *body, jl_sym_t *sym) JL_NOTSAFEPOINT static jl_value_t *jl_invoke_julia_macro(jl_array_t *args, jl_module_t *inmodule, jl_module_t **ctx, size_t world, int throw_load_error) { jl_task_t *ct = jl_current_task; - JL_TIMING(MACRO_INVOCATION); + JL_TIMING(MACRO_INVOCATION, MACRO_INVOCATION); size_t nargs = jl_array_len(args) + 1; JL_NARGSV("macrocall", 3); // macro name, location, and module jl_value_t **margs; @@ -1139,7 +1140,7 @@ static jl_value_t *jl_expand_macros(jl_value_t *expr, jl_module_t *inmodule, str JL_DLLEXPORT jl_value_t *jl_macroexpand(jl_value_t *expr, jl_module_t *inmodule) { - JL_TIMING(LOWERING); + JL_TIMING(LOWERING, LOWERING); JL_GC_PUSH1(&expr); expr = jl_copy_ast(expr); expr = jl_expand_macros(expr, inmodule, NULL, 0, jl_atomic_load_acquire(&jl_world_counter), 0); @@ -1150,7 +1151,7 @@ JL_DLLEXPORT jl_value_t *jl_macroexpand(jl_value_t *expr, jl_module_t *inmodule) JL_DLLEXPORT jl_value_t *jl_macroexpand1(jl_value_t *expr, jl_module_t *inmodule) { - JL_TIMING(LOWERING); + JL_TIMING(LOWERING, LOWERING); JL_GC_PUSH1(&expr); expr = jl_copy_ast(expr); expr = jl_expand_macros(expr, inmodule, NULL, 1, jl_atomic_load_acquire(&jl_world_counter), 0); @@ -1176,7 +1177,7 @@ JL_DLLEXPORT jl_value_t *jl_expand_with_loc(jl_value_t *expr, jl_module_t *inmod JL_DLLEXPORT jl_value_t *jl_expand_in_world(jl_value_t *expr, jl_module_t *inmodule, const char *file, int line, size_t world) { - JL_TIMING(LOWERING); + JL_TIMING(LOWERING, LOWERING); JL_GC_PUSH1(&expr); expr = jl_copy_ast(expr); expr = jl_expand_macros(expr, inmodule, NULL, 0, world, 1); @@ -1189,7 +1190,7 @@ JL_DLLEXPORT jl_value_t *jl_expand_in_world(jl_value_t *expr, jl_module_t *inmod JL_DLLEXPORT jl_value_t *jl_expand_with_loc_warn(jl_value_t *expr, jl_module_t *inmodule, const char *file, int line) { - JL_TIMING(LOWERING); + JL_TIMING(LOWERING, LOWERING); jl_array_t *kwargs = NULL; JL_GC_PUSH2(&expr, &kwargs); expr = jl_copy_ast(expr); @@ -1237,7 +1238,7 @@ JL_DLLEXPORT jl_value_t *jl_expand_with_loc_warn(jl_value_t *expr, jl_module_t * JL_DLLEXPORT jl_value_t *jl_expand_stmt_with_loc(jl_value_t *expr, jl_module_t *inmodule, const char *file, int line) { - JL_TIMING(LOWERING); + JL_TIMING(LOWERING, LOWERING); JL_GC_PUSH1(&expr); expr = jl_copy_ast(expr); expr = jl_expand_macros(expr, inmodule, NULL, 0, ~(size_t)0, 1); diff --git a/src/codegen.cpp b/src/codegen.cpp index 68fafba7a5a06..2d75d4ca8ebf3 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -8421,7 +8421,8 @@ jl_llvm_functions_t jl_emit_code( jl_value_t *jlrettype, jl_codegen_params_t ¶ms) { - JL_TIMING(CODEGEN); + JL_TIMING(CODEGEN, CODEGEN); + jl_timing_show_func_sig((jl_value_t *)li->specTypes, JL_TIMING_CURRENT_BLOCK); // caller must hold codegen_lock jl_llvm_functions_t decls = {}; assert((params.params == &jl_default_cgparams /* fast path */ || !params.cache || @@ -8463,7 +8464,8 @@ jl_llvm_functions_t jl_emit_codeinst( jl_code_info_t *src, jl_codegen_params_t ¶ms) { - JL_TIMING(CODEGEN); + JL_TIMING(CODEGEN, CODEGEN); + jl_timing_show_method_instance(codeinst->def, JL_TIMING_CURRENT_BLOCK); JL_GC_PUSH1(&src); if (!src) { src = (jl_code_info_t*)jl_atomic_load_relaxed(&codeinst->inferred); @@ -8542,7 +8544,7 @@ void jl_compile_workqueue( Module &original, jl_codegen_params_t ¶ms, CompilationPolicy policy) { - JL_TIMING(CODEGEN); + JL_TIMING(CODEGEN, CODEGEN); jl_code_info_t *src = NULL; JL_GC_PUSH1(&src); while (!params.workqueue.empty()) { diff --git a/src/gc.c b/src/gc.c index fc7c89cd03b66..a67783c741b43 100644 --- a/src/gc.c +++ b/src/gc.c @@ -311,6 +311,11 @@ NOINLINE uintptr_t gc_get_stack_ptr(void) void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) { + JL_TIMING(GC, Stop); +#ifdef USE_TRACY + TracyCZoneCtx ctx = *(JL_TIMING_CURRENT_BLOCK->tracy_ctx); + TracyCZoneColor(ctx, 0x696969); +#endif assert(gc_n_threads); if (gc_n_threads > 1) jl_wake_libuv(); @@ -2941,79 +2946,86 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) { combine_thread_gc_counts(&gc_num); +#ifdef USE_TRACY + TracyCPlot("Heap size", live_bytes + gc_num.allocd); +#endif + jl_gc_markqueue_t *mq = &ptls->mark_queue; uint64_t gc_start_time = jl_hrtime(); int64_t last_perm_scanned_bytes = perm_scanned_bytes; - JL_PROBE_GC_MARK_BEGIN(); uint64_t start_mark_time = jl_hrtime(); - - // 1. fix GC bits of objects in the remset. - assert(gc_n_threads); - for (int t_i = 0; t_i < gc_n_threads; t_i++) { - jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 != NULL) - gc_premark(ptls2); - } - - assert(gc_n_threads); - for (int t_i = 0; t_i < gc_n_threads; t_i++) { - jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 != NULL) { - // 2.1. mark every thread local root - gc_queue_thread_local(mq, ptls2); - // 2.2. mark any managed objects in the backtrace buffer - // TODO: treat these as roots for gc_heap_snapshot_record - gc_queue_bt_buf(mq, ptls2); - // 2.3. mark every object in the `last_remsets` and `rem_binding` - gc_queue_remset(ptls, ptls2); + JL_PROBE_GC_MARK_BEGIN(); + { + JL_TIMING(GC, Mark); + + // 1. fix GC bits of objects in the remset. + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 != NULL) + gc_premark(ptls2); + } + + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 != NULL) { + // 2.1. mark every thread local root + gc_queue_thread_local(mq, ptls2); + // 2.2. mark any managed objects in the backtrace buffer + // TODO: treat these as roots for gc_heap_snapshot_record + gc_queue_bt_buf(mq, ptls2); + // 2.3. mark every object in the `last_remsets` and `rem_binding` + gc_queue_remset(ptls, ptls2); + } } - } - // 3. walk roots - gc_mark_roots(mq); - if (gc_cblist_root_scanner) { - gc_invoke_callbacks(jl_gc_cb_root_scanner_t, - gc_cblist_root_scanner, (collection)); - } - gc_mark_loop(ptls); - - // 4. check for objects to finalize - clear_weak_refs(); - // Record the length of the marked list since we need to - // mark the object moved to the marked list from the - // `finalizer_list` by `sweep_finalizer_list` - size_t orig_marked_len = finalizer_list_marked.len; - assert(gc_n_threads); - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 != NULL) - sweep_finalizer_list(&ptls2->finalizers); + // 3. walk roots + gc_mark_roots(mq); + if (gc_cblist_root_scanner) { + gc_invoke_callbacks(jl_gc_cb_root_scanner_t, + gc_cblist_root_scanner, (collection)); + } + gc_mark_loop(ptls); + + // 4. check for objects to finalize + clear_weak_refs(); + // Record the length of the marked list since we need to + // mark the object moved to the marked list from the + // `finalizer_list` by `sweep_finalizer_list` + size_t orig_marked_len = finalizer_list_marked.len; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 != NULL) + sweep_finalizer_list(&ptls2->finalizers); + } + if (prev_sweep_full) { + sweep_finalizer_list(&finalizer_list_marked); + orig_marked_len = 0; + } + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 != NULL) + gc_mark_finlist(mq, &ptls2->finalizers, 0); + } + gc_mark_finlist(mq, &finalizer_list_marked, orig_marked_len); + // "Flush" the mark stack before flipping the reset_age bit + // so that the objects are not incorrectly reset. + gc_mark_loop(ptls); + // Conservative marking relies on age to tell allocated objects + // and freelist entries apart. + mark_reset_age = !jl_gc_conservative_gc_support_enabled(); + // Reset the age and old bit for any unmarked objects referenced by the + // `to_finalize` list. These objects are only reachable from this list + // and should not be referenced by any old objects so this won't break + // the GC invariant. + gc_mark_finlist(mq, &to_finalize, 0); + gc_mark_loop(ptls); + mark_reset_age = 0; } - if (prev_sweep_full) { - sweep_finalizer_list(&finalizer_list_marked); - orig_marked_len = 0; - } - assert(gc_n_threads); - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 != NULL) - gc_mark_finlist(mq, &ptls2->finalizers, 0); - } - gc_mark_finlist(mq, &finalizer_list_marked, orig_marked_len); - // "Flush" the mark stack before flipping the reset_age bit - // so that the objects are not incorrectly reset. - gc_mark_loop(ptls); - // Conservative marking relies on age to tell allocated objects - // and freelist entries apart. - mark_reset_age = !jl_gc_conservative_gc_support_enabled(); - // Reset the age and old bit for any unmarked objects referenced by the - // `to_finalize` list. These objects are only reachable from this list - // and should not be referenced by any old objects so this won't break - // the GC invariant. - gc_mark_finlist(mq, &to_finalize, 0); - gc_mark_loop(ptls); - mark_reset_age = 0; gc_num.since_sweep += gc_num.allocd; JL_PROBE_GC_MARK_END(scanned_bytes, perm_scanned_bytes); @@ -3081,7 +3093,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } } - // If the live data outgrows the suggested max_total_memory // we keep going with minimum intervals and full gcs until // we either free some space or get an OOM error. @@ -3106,15 +3117,24 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // 6. start sweeping uint64_t start_sweep_time = jl_hrtime(); JL_PROBE_GC_SWEEP_BEGIN(sweep_full); - sweep_weak_refs(); - sweep_stack_pools(); - gc_sweep_foreign_objs(); - gc_sweep_other(ptls, sweep_full); - gc_scrub(); - gc_verify_tags(); - gc_sweep_pool(sweep_full); - if (sweep_full) - gc_sweep_perm_alloc(); + { + JL_TIMING(GC, Sweep); +#ifdef USE_TRACY + if (sweep_full) { + TracyCZoneCtx ctx = *(JL_TIMING_CURRENT_BLOCK->tracy_ctx); + TracyCZoneColor(ctx, 0xFFA500); + } +#endif + sweep_weak_refs(); + sweep_stack_pools(); + gc_sweep_foreign_objs(); + gc_sweep_other(ptls, sweep_full); + gc_scrub(); + gc_verify_tags(); + gc_sweep_pool(sweep_full); + if (sweep_full) + gc_sweep_perm_alloc(); + } JL_PROBE_GC_SWEEP_END(); uint64_t gc_end_time = jl_hrtime(); @@ -3243,22 +3263,10 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); return; } -#ifdef USE_TRACY - static uint8_t first_time = 1; - if (first_time) { - first_time = 0; - TracyCFiberEnter("Main"); - } - TracyCFiberLeave; - TracyCFiberEnter("GC"); - { - int64_t tb; - jl_gc_get_total_bytes(&tb); - TracyCPlot("Heap size", ((double)tb) / (1024.0 * 1024.0)); - } -#endif -{ - JL_TIMING(GC); + + JL_TIMING_SUSPEND(GC, ct); + JL_TIMING(GC, GC); + int last_errno = errno; #ifdef _OS_WINDOWS_ DWORD last_error = GetLastError(); @@ -3311,6 +3319,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) // Doing this on all threads is racy (it's impossible to check // or wait for finalizers on other threads without dead lock). if (!ptls->finalizers_inhibited && ptls->locks.len == 0) { + JL_TIMING(GC, Finalizers); run_finalizers(ct); } JL_PROBE_GC_FINALIZER(); @@ -3321,15 +3330,9 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) SetLastError(last_error); #endif errno = last_errno; -} + #ifdef USE_TRACY - { - int64_t tb; - jl_gc_get_total_bytes(&tb); - TracyCPlot("Heap size", ((double)tb) / (1024.0 * 1024.0)); - } - TracyCFiberLeave; - TracyCFiberEnter("Main"); + TracyCPlot("Heap size", jl_gc_live_bytes()); #endif } @@ -3432,6 +3435,9 @@ void jl_gc_init(void) if (jl_options.heap_size_hint) jl_gc_set_max_memory(jl_options.heap_size_hint); +#ifdef USE_TRACY + TracyCPlotConfig("Heap size", TracyPlotFormatMemory, /* rectilinear */ 0, /* fill */ 1, /* color */ 0); +#endif t_start = jl_hrtime(); } diff --git a/src/gf.c b/src/gf.c index 0b01f5e8e6ee2..57a81deda2d36 100644 --- a/src/gf.c +++ b/src/gf.c @@ -314,7 +314,7 @@ jl_datatype_t *jl_mk_builtin_func(jl_datatype_t *dt, const char *name, jl_fptr_a // if inference doesn't occur (or can't finish), returns NULL instead jl_code_info_t *jl_type_infer(jl_method_instance_t *mi, size_t world, int force) { - JL_TIMING(INFERENCE); + JL_TIMING(INFERENCE, INFERENCE); if (jl_typeinf_func == NULL) return NULL; jl_task_t *ct = jl_current_task; @@ -337,6 +337,8 @@ jl_code_info_t *jl_type_infer(jl_method_instance_t *mi, size_t world, int force) fargs[0] = (jl_value_t*)jl_typeinf_func; fargs[1] = (jl_value_t*)mi; fargs[2] = jl_box_ulong(world); + + jl_timing_show_method_instance(mi, JL_TIMING_CURRENT_BLOCK); #ifdef TRACE_INFERENCE if (mi->specTypes != (jl_value_t*)jl_emptytuple_type) { jl_printf(JL_STDERR,"inference on "); @@ -391,6 +393,7 @@ jl_code_info_t *jl_type_infer(jl_method_instance_t *mi, size_t world, int force) } JL_GC_POP(); #endif + return src; } @@ -1923,9 +1926,10 @@ static int is_replacing(jl_value_t *type, jl_method_t *m, jl_method_t *const *d, JL_DLLEXPORT void jl_method_table_insert(jl_methtable_t *mt, jl_method_t *method, jl_tupletype_t *simpletype) { - JL_TIMING(ADD_METHOD); + JL_TIMING(ADD_METHOD, ADD_METHOD); assert(jl_is_method(method)); assert(jl_is_mtable(mt)); + jl_timing_show((jl_value_t *)method, JL_TIMING_CURRENT_BLOCK); jl_value_t *type = method->sig; jl_value_t *oldvalue = NULL; jl_array_t *oldmi = NULL; @@ -2205,7 +2209,7 @@ jl_method_instance_t *jl_method_lookup(jl_value_t **args, size_t nargs, size_t w JL_DLLEXPORT jl_value_t *jl_matching_methods(jl_tupletype_t *types, jl_value_t *mt, int lim, int include_ambiguous, size_t world, size_t *min_valid, size_t *max_valid, int *ambig) { - JL_TIMING(METHOD_MATCH); + JL_TIMING(METHOD_MATCH, METHOD_MATCH); if (ambig != NULL) *ambig = 0; jl_value_t *unw = jl_unwrap_unionall((jl_value_t*)types); @@ -2930,7 +2934,7 @@ STATIC_INLINE jl_method_instance_t *jl_lookup_generic_(jl_value_t *F, jl_value_t int64_t last_alloc; if (i == 4) { // if no method was found in the associative cache, check the full cache - JL_TIMING(METHOD_LOOKUP_FAST); + JL_TIMING(METHOD_LOOKUP_FAST, METHOD_LOOKUP_FAST); mt = jl_gf_mtable(F); jl_array_t *leafcache = jl_atomic_load_relaxed(&mt->leafcache); entry = NULL; @@ -2973,7 +2977,7 @@ STATIC_INLINE jl_method_instance_t *jl_lookup_generic_(jl_value_t *F, jl_value_t assert(tt); JL_LOCK(&mt->writelock); // cache miss case - JL_TIMING(METHOD_LOOKUP_SLOW); + JL_TIMING(METHOD_LOOKUP_SLOW, METHOD_LOOKUP_SLOW); mfunc = jl_mt_assoc_by_type(mt, tt, world); JL_UNLOCK(&mt->writelock); JL_GC_POP(); diff --git a/src/ircode.c b/src/ircode.c index f967dd1a29f51..71ee292cbc397 100644 --- a/src/ircode.c +++ b/src/ircode.c @@ -760,7 +760,7 @@ static jl_value_t *jl_decode_value(jl_ircode_state *s) JL_GC_DISABLED JL_DLLEXPORT jl_array_t *jl_compress_ir(jl_method_t *m, jl_code_info_t *code) { - JL_TIMING(AST_COMPRESS); + JL_TIMING(AST_COMPRESS, AST_COMPRESS); JL_LOCK(&m->writelock); // protect the roots array (Might GC) assert(jl_is_method(m)); assert(jl_is_code_info(code)); @@ -855,7 +855,7 @@ JL_DLLEXPORT jl_code_info_t *jl_uncompress_ir(jl_method_t *m, jl_code_instance_t { if (jl_is_code_info(data)) return (jl_code_info_t*)data; - JL_TIMING(AST_UNCOMPRESS); + JL_TIMING(AST_UNCOMPRESS, AST_UNCOMPRESS); JL_LOCK(&m->writelock); // protect the roots array (Might GC) assert(jl_is_method(m)); assert(jl_typeis(data, jl_array_uint8_type)); diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index c7e202b98efab..7c68b78c27850 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -192,6 +192,8 @@ static jl_callptr_t _jl_compile_codeinst( "invalid world for method-instance"); assert(src && jl_is_code_info(src)); + JL_TIMING(LLVM_MODULE_FINISH, LLVM_MODULE_FINISH); + jl_callptr_t fptr = NULL; // emit the code in LLVM IR form jl_codegen_params_t params(std::move(context), jl_ExecutionEngine->getDataLayout(), jl_ExecutionEngine->getTargetTriple()); // Locks the context @@ -245,10 +247,10 @@ static jl_callptr_t _jl_compile_codeinst( MaxWorkqueueSize.updateMax(emitted.size()); IndirectCodeinsts += emitted.size() - 1; } - JL_TIMING(LLVM_MODULE_FINISH); for (auto &def : emitted) { jl_code_instance_t *this_code = def.first; + jl_timing_show_func_sig(this_code->def->specTypes, JL_TIMING_CURRENT_BLOCK); jl_llvm_functions_t decls = std::get<1>(def.second); jl_callptr_t addr; bool isspecsig = false; @@ -1175,7 +1177,7 @@ namespace { } } - JL_TIMING(LLVM_OPT); + JL_TIMING(LLVM_OPT, LLVM_OPT); //Run the optimization assert(!verifyModule(M, &errs())); @@ -1410,7 +1412,7 @@ void JuliaOJIT::addGlobalMapping(StringRef Name, uint64_t Addr) void JuliaOJIT::addModule(orc::ThreadSafeModule TSM) { - JL_TIMING(LLVM_MODULE_FINISH); + JL_TIMING(LLVM_MODULE_FINISH, LLVM_MODULE_FINISH); ++ModulesAdded; orc::SymbolLookupSet NewExports; TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT { diff --git a/src/jltypes.c b/src/jltypes.c index 4a451e9b70e80..fe857e03817b5 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -732,7 +732,7 @@ static ssize_t lookup_type_idx_linearvalue(jl_svec_t *cache, jl_value_t *key1, j static jl_value_t *lookup_type(jl_typename_t *tn JL_PROPAGATES_ROOT, jl_value_t **key, size_t n) { - JL_TIMING(TYPE_CACHE_LOOKUP); + JL_TIMING(TYPE_CACHE_LOOKUP, TYPE_CACHE_LOOKUP); if (tn == jl_type_typename) { assert(n == 1); jl_value_t *uw = jl_unwrap_unionall(key[0]); @@ -753,7 +753,7 @@ static jl_value_t *lookup_type(jl_typename_t *tn JL_PROPAGATES_ROOT, jl_value_t static jl_value_t *lookup_typevalue(jl_typename_t *tn, jl_value_t *key1, jl_value_t **key, size_t n, int leaf) { - JL_TIMING(TYPE_CACHE_LOOKUP); + JL_TIMING(TYPE_CACHE_LOOKUP, TYPE_CACHE_LOOKUP); unsigned hv = typekeyvalue_hash(tn, key1, key, n, leaf); if (hv) { jl_svec_t *cache = jl_atomic_load_relaxed(&tn->cache); @@ -874,7 +874,7 @@ static int is_cacheable(jl_datatype_t *type) void jl_cache_type_(jl_datatype_t *type) { - JL_TIMING(TYPE_CACHE_INSERT); + JL_TIMING(TYPE_CACHE_INSERT, TYPE_CACHE_INSERT); assert(is_cacheable(type)); jl_value_t **key = jl_svec_data(type->parameters); int n = jl_svec_len(type->parameters); diff --git a/src/julia.h b/src/julia.h index fc8a4b8daa524..a1e0a5a46c0ce 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1934,6 +1934,10 @@ typedef struct _jl_task_t { uint16_t priority; // hidden state: + +#ifdef USE_TRACY + const char *name; +#endif // id of owning thread - does not need to be defined until the task runs _Atomic(int16_t) tid; // threadpool id diff --git a/src/method.c b/src/method.c index ee333bebccedf..de78d9ab884ed 100644 --- a/src/method.c +++ b/src/method.c @@ -564,9 +564,10 @@ JL_DLLEXPORT jl_code_info_t *jl_code_for_staged(jl_method_instance_t *linfo, siz return (jl_code_info_t*)jl_copy_ast((jl_value_t*)uninferred); } - JL_TIMING(STAGED_FUNCTION); + JL_TIMING(STAGED_FUNCTION, STAGED_FUNCTION); jl_value_t *tt = linfo->specTypes; jl_method_t *def = linfo->def.method; + jl_timing_show_method_instance(linfo, JL_TIMING_CURRENT_BLOCK); jl_value_t *generator = def->generator; assert(generator != NULL); assert(jl_is_method(def)); diff --git a/src/rtutils.c b/src/rtutils.c index dd606f38d065c..41e45cd468039 100644 --- a/src/rtutils.c +++ b/src/rtutils.c @@ -1294,7 +1294,11 @@ JL_DLLEXPORT size_t jl_static_show_func_sig(JL_STREAM *s, jl_value_t *type) JL_N for (i = 1; i < tl; i++) { jl_value_t *tp = jl_tparam(type, i); if (i != tl - 1) { - n += jl_static_show_x(s, tp, depth); + if (jl_is_datatype(tp)) { + n += jl_printf(s, "%s", jl_symbol_name(((jl_datatype_t*)tp)->name->name)); + } else { + n += jl_static_show_x(s, tp, depth); + } n += jl_printf(s, ", "); } else { @@ -1302,7 +1306,11 @@ JL_DLLEXPORT size_t jl_static_show_func_sig(JL_STREAM *s, jl_value_t *type) JL_N tp = jl_unwrap_vararg(tp); if (jl_is_unionall(tp)) n += jl_printf(s, "("); - n += jl_static_show_x(s, tp, depth); + if (jl_is_datatype(tp)) { + n += jl_printf(s, "%s", jl_symbol_name(((jl_datatype_t*)tp)->name->name)); + } else { + n += jl_static_show_x(s, tp, depth); + } if (jl_is_unionall(tp)) n += jl_printf(s, ")"); n += jl_printf(s, "..."); diff --git a/src/safepoint.c b/src/safepoint.c index 1ff26d616a5d8..19eca4bf6f00d 100644 --- a/src/safepoint.c +++ b/src/safepoint.c @@ -150,8 +150,10 @@ void jl_safepoint_end_gc(void) void jl_safepoint_wait_gc(void) { + jl_task_t *ct = jl_current_task; (void)ct; + JL_TIMING_SUSPEND(GC_SAFEPOINT, ct); // The thread should have set this is already - assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) != 0); + assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != 0); // Use normal volatile load in the loop for speed until GC finishes. // Then use an acquire load to make sure the GC result is visible on this thread. while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) { diff --git a/src/staticdata.c b/src/staticdata.c index d6c92ac93a851..5c6da3f200edb 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -2520,7 +2520,7 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli { jl_gc_collect(JL_GC_FULL); jl_gc_collect(JL_GC_INCREMENTAL); // sweep finalizers - JL_TIMING(SYSIMG_DUMP); + JL_TIMING(SYSIMG_DUMP, SYSIMG_DUMP); // iff emit_split // write header and src_text to one file f/s @@ -2672,7 +2672,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl jl_array_t **ext_targets, jl_array_t **edges, char **base, arraylist_t *ccallable_list, pkgcachesizes *cachesizes) JL_GC_DISABLED { - JL_TIMING(SYSIMG_LOAD); + JL_TIMING(SYSIMG_LOAD, SYSIMG_LOAD); int en = jl_gc_enable(0); ios_t sysimg, const_data, symbols, relocs, gvar_record, fptr_record; jl_serializer_state s; diff --git a/src/task.c b/src/task.c index 1035d54f2ce9a..e73b19563e336 100644 --- a/src/task.c +++ b/src/task.c @@ -646,13 +646,7 @@ JL_DLLEXPORT void jl_switch(void) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER int finalizers_inhibited = ptls->finalizers_inhibited; ptls->finalizers_inhibited = 0; -#ifdef ENABLE_TIMINGS - jl_timing_block_t *blk = ptls->timing_stack; - if (blk) - jl_timing_block_stop(blk); - ptls->timing_stack = NULL; -#endif - + jl_timing_block_t *blk = jl_timing_block_exit_task(ct, ptls); ctx_switch(ct); #ifdef MIGRATE_TASKS @@ -672,15 +666,7 @@ JL_DLLEXPORT void jl_switch(void) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER 0 != ct->ptls && 0 == ptls->finalizers_inhibited); ptls->finalizers_inhibited = finalizers_inhibited; - -#ifdef ENABLE_TIMINGS - assert(ptls->timing_stack == NULL); - ptls->timing_stack = blk; - if (blk) - jl_timing_block_start(blk); -#else - (void)ct; -#endif + jl_timing_block_enter_task(ct, ptls, blk); (void)blk; sig_atomic_t other_defer_signal = ptls->defer_signal; ptls->defer_signal = defer_signal; @@ -1082,6 +1068,30 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion t->start = start; t->result = jl_nothing; t->donenotify = completion_future; +#ifdef USE_TRACY + jl_value_t *start_type = jl_typeof(t->start); + const char *start_name = ""; + if (jl_is_datatype(start_type)) + start_name = jl_symbol_name(((jl_datatype_t *) start_type)->name->name); + + static uint16_t task_id = 1; + + // XXX: Tracy uses this as a handle internally and requires that this + // string live forever, so this allocation is intentionally leaked. + char *fiber_name; + if (start_name[0] == '#') { + jl_method_instance_t *mi = jl_method_lookup(&t->start, 1, jl_get_world_counter()); + size_t fiber_name_len = strlen(jl_symbol_name(mi->def.method->file)) + 22; // 22 characters in "Task 65535 (:0000000)\0" + fiber_name = (char *)malloc(fiber_name_len); + snprintf(fiber_name, fiber_name_len, "Task %d (%s:%d)", task_id++, jl_symbol_name(mi->def.method->file), mi->def.method->line); + } else { + size_t fiber_name_len = strlen(start_name) + 16; // 16 characters in "Task 65535 (\"\")\0" + fiber_name = (char *)malloc(fiber_name_len); + snprintf(fiber_name, fiber_name_len, "Task %d (\"%s\")", task_id++, start_name); + } + + t->name = fiber_name; +#endif jl_atomic_store_relaxed(&t->_isexception, 0); // Inherit logger state from parent task t->logstate = ct->logstate; @@ -1237,6 +1247,7 @@ CFI_NORETURN ct->started = 1; JL_PROBE_RT_START_TASK(ct); + jl_timing_block_enter_task(ct, ptls, NULL); if (jl_atomic_load_relaxed(&ct->_isexception)) { record_backtrace(ptls, 0); jl_push_excstack(&ct->excstack, ct->result, @@ -1249,14 +1260,8 @@ CFI_NORETURN ptls->defer_signal = 0; jl_sigint_safepoint(ptls); } -//#ifdef USE_TRACY - //TracyFiberEnter(fiber); -//#endif - { - // TODO: Re-enable - //JL_TIMING(ROOT); - res = jl_apply(&ct->start, 1); - } + JL_TIMING(ROOT, ROOT); + res = jl_apply(&ct->start, 1); } JL_CATCH { res = jl_current_exception(); @@ -1265,9 +1270,6 @@ CFI_NORETURN } skip_pop_exception:; } -//#ifdef USE_TRACY - //TracyFiberLeave(fiber); -//#endif ct->result = res; jl_gc_wb(ct, ct->result); jl_finish_task(ct); @@ -1678,6 +1680,12 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi) ct->stkbuf = stack; ct->bufsz = ssize; } + +#ifdef USE_TRACY + char *unique_string = (char *)malloc(strlen("Root") + 1); + strcpy(unique_string, "Root"); + ct->name = unique_string; +#endif ct->started = 1; ct->next = jl_nothing; ct->queue = jl_nothing; @@ -1710,6 +1718,8 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi) ct->ctx.asan_fake_stack = NULL; #endif + jl_timing_block_enter_task(ct, ptls, NULL); + #ifdef COPY_STACKS // initialize the base_ctx from which all future copy_stacks will be copies if (always_copy_stacks) { diff --git a/src/timing.c b/src/timing.c index 929a09305f993..b2a483747dc6e 100644 --- a/src/timing.c +++ b/src/timing.c @@ -18,7 +18,24 @@ extern "C" { #endif static uint64_t t0; -JL_DLLEXPORT uint64_t jl_timing_data[(int)JL_TIMING_LAST] = {0}; +#ifdef USE_TRACY +/** + * These sources often generate millions of events / minute. Although Tracy + * can generally keep up with that, those events also bloat the saved ".tracy" + * files, so we disable them by default. + **/ +JL_DLLEXPORT uint64_t jl_timing_enable_mask = 0xFFFFFFFFFFFFFFFF & + ~(1ull << JL_TIMING_ROOT) & + ~(1ull << JL_TIMING_TYPE_CACHE_LOOKUP) & + ~(1ull << JL_TIMING_METHOD_MATCH) & + ~(1ull << JL_TIMING_METHOD_LOOKUP_FAST) & + ~(1ull << JL_TIMING_AST_COMPRESS) & + ~(1ull << JL_TIMING_AST_UNCOMPRESS); +#else +JL_DLLEXPORT uint64_t jl_timing_enable_mask = 0xFFFFFFFFFFFFFFFF; +#endif + +JL_DLLEXPORT uint64_t jl_timing_counts[(int)JL_TIMING_LAST] = {0}; const char *jl_timing_names[(int)JL_TIMING_LAST] = { #define X(name) #name @@ -31,13 +48,13 @@ void jl_print_timings(void) uint64_t total_time = cycleclock() - t0; uint64_t root_time = total_time; for (int i = 0; i < JL_TIMING_LAST; i++) { - root_time -= jl_timing_data[i]; + root_time -= jl_timing_counts[i]; } - jl_timing_data[0] = root_time; + jl_timing_counts[0] = root_time; for (int i = 0; i < JL_TIMING_LAST; i++) { - if (jl_timing_data[i] != 0) + if (jl_timing_counts[i] != 0) fprintf(stderr, "%-25s : %5.2f %% %" PRIu64 "\n", jl_timing_names[i], - 100 * (((double)jl_timing_data[i]) / total_time), jl_timing_data[i]); + 100 * (((double)jl_timing_counts[i]) / total_time), jl_timing_counts[i]); } } @@ -62,20 +79,151 @@ jl_timing_block_t *jl_pop_timing_block(jl_timing_block_t *cur_block) return cur_block->prev; } -void jl_timing_block_start(jl_timing_block_t *cur_block) +void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t *prev_blk) +{ + if (prev_blk != NULL) { + assert(ptls->timing_stack == NULL); + + ptls->timing_stack = prev_blk; + if (prev_blk != NULL) { + _COUNTS_START(&prev_blk->counts_ctx, cycleclock()); + } + } + +#ifdef USE_TRACY + TracyCFiberEnter(ct->name); +#else + (void)ct; +#endif +} + +jl_timing_block_t *jl_timing_block_exit_task(jl_task_t *ct, jl_ptls_t ptls) +{ +#ifdef USE_TRACY + // Tracy is fairly strict about not leaving a fiber that + // hasn't been entered, which happens often when + // connecting to a running Julia session. + // + // Eventually, Tracy will support telling the server that + // which fibers are active upon connection, but until then + // work around around the problem by just entering the new + // fiber directly, which implicitly leaves any active fibers. + + //TracyCFiberLeave; +#endif + (void)ct; + + jl_timing_block_t *blk = ptls->timing_stack; + ptls->timing_stack = NULL; + + if (blk != NULL) { + _COUNTS_STOP(&blk->counts_ctx, cycleclock()); + } + return blk; +} + +static inline const char *gnu_basename(const char *path) { - _jl_timing_block_start(cur_block, cycleclock()); + char *base = strrchr(path, '/'); + return base ? base+1 : path; } -void jl_timing_block_stop(jl_timing_block_t *cur_block) +JL_DLLEXPORT void jl_timing_show(jl_value_t *v, jl_timing_block_t *cur_block) { - _jl_timing_block_stop(cur_block, cycleclock()); +#ifdef USE_TRACY + ios_t buf; + ios_mem(&buf, IOS_INLSIZE); + buf.growable = 0; // Restrict to inline buffer to avoid allocation + + jl_static_show((JL_STREAM*)&buf, v); + if (buf.size == buf.maxsize) + memset(&buf.buf[IOS_INLSIZE - 3], '.', 3); + + TracyCZoneText(*(cur_block->tracy_ctx), buf.buf, buf.size); +#endif +} + +JL_DLLEXPORT void jl_timing_show_module(jl_module_t *m, jl_timing_block_t *cur_block) +{ +#ifdef USE_TRACY + const char *module_name = jl_symbol_name(m->name); + TracyCZoneText(*(cur_block->tracy_ctx), module_name, strlen(module_name)); +#endif +} + +JL_DLLEXPORT void jl_timing_show_filename(const char *path, jl_timing_block_t *cur_block) +{ +#ifdef USE_TRACY + const char *filename = gnu_basename(path); + TracyCZoneText(*(cur_block->tracy_ctx), filename, strlen(filename)); +#endif +} + +JL_DLLEXPORT void jl_timing_show_method_instance(jl_method_instance_t *mi, jl_timing_block_t *cur_block) +{ + jl_timing_show_func_sig(mi->specTypes, cur_block); + jl_method_t *def = mi->def.method; + jl_timing_printf(cur_block, "%s:%d in %s", + gnu_basename(jl_symbol_name(def->file)), + def->line, + jl_symbol_name(def->module->name)); +} + +JL_DLLEXPORT void jl_timing_show_func_sig(jl_value_t *v, jl_timing_block_t *cur_block) +{ +#ifdef USE_TRACY + ios_t buf; + ios_mem(&buf, IOS_INLSIZE); + buf.growable = 0; // Restrict to inline buffer to avoid allocation + + jl_static_show_func_sig((JL_STREAM*)&buf, v); + if (buf.size == buf.maxsize) + memset(&buf.buf[IOS_INLSIZE - 3], '.', 3); + + TracyCZoneText(*(cur_block->tracy_ctx), buf.buf, buf.size); +#endif +} + +JL_DLLEXPORT void jl_timing_printf(jl_timing_block_t *cur_block, const char *format, ...) +{ + va_list args; + va_start(args, format); + +#ifdef USE_TRACY + ios_t buf; + ios_mem(&buf, IOS_INLSIZE); + buf.growable = 0; // Restrict to inline buffer to avoid allocation + + jl_vprintf((JL_STREAM*)&buf, format, args); + if (buf.size == buf.maxsize) + memset(&buf.buf[IOS_INLSIZE - 3], '.', 3); + + TracyCZoneText(*(cur_block->tracy_ctx), buf.buf, buf.size); +#endif + va_end(args); +} + +JL_DLLEXPORT int jl_timing_set_enable(const char *subsystem, uint8_t enabled) +{ + for (int i = 0; i < JL_TIMING_LAST; i++) { + if (strcmp(subsystem, jl_timing_names[i]) == 0) { + uint64_t subsystem_bit = (1ul << i); + if (enabled) { + jl_timing_enable_mask |= subsystem_bit; + } else { + jl_timing_enable_mask &= ~subsystem_bit; + } + return 0; + } + } + return -1; } #else void jl_init_timing(void) { } void jl_destroy_timing(void) { } +JL_DLLEXPORT int jl_timing_set_enable(const char *subsystem, uint8_t enabled) { return -1; } #endif diff --git a/src/timing.h b/src/timing.h index c9a1fd1fb4807..920440edf9da7 100644 --- a/src/timing.h +++ b/src/timing.h @@ -6,14 +6,41 @@ #ifdef __cplusplus extern "C" { #endif + void jl_init_timing(void); void jl_destroy_timing(void) JL_NOTSAFEPOINT; + +// Update the enable bit-mask to enable/disable tracing events for +// the subsystem in `jl_timing_names` matching the provided string. +// +// Returns -1 if no matching sub-system was found. +int jl_timing_set_enable(const char *subsystem, uint8_t enabled); + #ifdef __cplusplus } #endif -#ifndef ENABLE_TIMINGS -#define JL_TIMING(owner) +#ifdef __cplusplus +#define HAVE_TIMING_SUPPORT +#elif defined(_COMPILER_CLANG_) +#define HAVE_TIMING_SUPPORT +#elif defined(_COMPILER_GCC_) +#define HAVE_TIMING_SUPPORT +#endif + +#if !defined( ENABLE_TIMINGS ) || !defined( HAVE_TIMING_SUPPORT ) + +#define JL_TIMING(subsystem, event) +#define JL_TIMING_SUSPEND(subsystem, ct) +#define jl_timing_show(v, b) +#define jl_timing_show_module(m, b) +#define jl_timing_show_filename(f, b) +#define jl_timing_show_method_instance(mi, b) +#define jl_timing_show_func_sig(tt, b) +#define jl_timing_block_enter_task(ct, ptls, blk) +#define jl_timing_block_exit_task(ct, ptls) ((jl_timing_block_t *)NULL) +#define jl_pop_timing_block(blk) + #else #include "julia_assert.h" @@ -26,23 +53,28 @@ extern "C" { #endif void jl_print_timings(void); jl_timing_block_t *jl_pop_timing_block(jl_timing_block_t *cur_block); -void jl_timing_block_start(jl_timing_block_t *cur_block); -void jl_timing_block_stop(jl_timing_block_t *cur_block); +void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t *prev_blk); +jl_timing_block_t *jl_timing_block_exit_task(jl_task_t *ct, jl_ptls_t ptls); + +// Add the output of `jl_static_show(x)` as a text annotation to the +// profiling region corresponding to `cur_block`. +// +// If larger than IOS_INLSIZE (~80 characters), text is truncated. +void jl_timing_show(jl_value_t *v, jl_timing_block_t *cur_block); +void jl_timing_show_module(jl_module_t *m, jl_timing_block_t *cur_block); +void jl_timing_show_filename(const char *path, jl_timing_block_t *cur_block); +void jl_timing_show_method_instance(jl_method_instance_t *mi, jl_timing_block_t *cur_block); +void jl_timing_show_func_sig(jl_value_t *v, jl_timing_block_t *cur_block); +void jl_timing_printf(jl_timing_block_t *cur_block, const char *format, ...); #ifdef __cplusplus } #endif #ifdef __cplusplus -#define HAVE_TIMING_SUPPORT -#elif defined(_COMPILER_CLANG_) -#define HAVE_TIMING_SUPPORT -#elif defined(_COMPILER_GCC_) -#define HAVE_TIMING_SUPPORT -#endif - -#ifndef HAVE_TIMING_SUPPORT -#define JL_TIMING(owner) +#define JL_TIMING_CURRENT_BLOCK (&__timing_block.block) #else +#define JL_TIMING_CURRENT_BLOCK (&__timing_block) +#endif #define JL_TIMING_OWNERS \ X(ROOT), \ @@ -77,23 +109,61 @@ enum jl_timing_owners { JL_TIMING_LAST }; -extern uint64_t jl_timing_data[(int)JL_TIMING_LAST]; -extern const char *jl_timing_names[(int)JL_TIMING_LAST]; +/** + * Timing back-ends differ in terms of whether they support nested + * and asynchronous events. + **/ + +/** + * Timing Backend: Aggregated timing counts (implemented in timing.c) + **/ + +#define USE_COUNTS + +#ifdef USE_COUNTS +#define _COUNTS_CTX_MEMBER jl_timing_counts_t counts_ctx; +#define _COUNTS_CTOR(block, owner) _jl_timing_counts_ctor(block, owner) +#define _COUNTS_DESTROY(block) _jl_timing_counts_destroy(block) +#define _COUNTS_START(block, t) _jl_timing_counts_start(block, t) +#define _COUNTS_STOP(block, t) _jl_timing_counts_stop(block, t) +#else +#define _COUNTS_CTX_MEMBER +#define _COUNTS_CTOR(block, owner) +#define _COUNTS_DESTROY(block) +#define _COUNTS_START(block, t) +#define _COUNTS_STOP(block, t) +#endif + +/** + * Timing Backend: Tracy + **/ -struct _jl_timing_block_t { // typedef in julia.h #ifdef USE_TRACY - TracyCZoneCtx *tracy_ctx; +#define _TRACY_CTX_MEMBER TracyCZoneCtx *tracy_ctx; +#define _TRACY_CTOR(context, name, enable) TracyCZoneN(__tracy_ctx, name, (enable)); \ + (context) = &__tracy_ctx +#define _TRACY_DESTROY(ctx) TracyCZoneEnd(*ctx) +#else +#define _TRACY_CTX_MEMBER +#define _TRACY_CTOR(context, name, enable) +#define _TRACY_DESTROY(block) #endif - jl_timing_block_t *prev; + +/** + * Implementation: Aggregated counts back-end + **/ + +extern uint64_t jl_timing_counts[(int)JL_TIMING_LAST]; +typedef struct _jl_timing_counts_t { uint64_t total; uint64_t t0; int owner; #ifdef JL_DEBUG_BUILD uint8_t running; #endif -}; +} jl_timing_counts_t; -STATIC_INLINE void _jl_timing_block_stop(jl_timing_block_t *block, uint64_t t) JL_NOTSAFEPOINT { +STATIC_INLINE void _jl_timing_counts_stop(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT { #ifdef JL_DEBUG_BUILD assert(block->running); block->running = 0; @@ -101,7 +171,7 @@ STATIC_INLINE void _jl_timing_block_stop(jl_timing_block_t *block, uint64_t t) J block->total += t - block->t0; } -STATIC_INLINE void _jl_timing_block_start(jl_timing_block_t *block, uint64_t t) JL_NOTSAFEPOINT { +STATIC_INLINE void _jl_timing_counts_start(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT { #ifdef JL_DEBUG_BUILD assert(!block->running); block->running = 1; @@ -109,40 +179,75 @@ STATIC_INLINE void _jl_timing_block_start(jl_timing_block_t *block, uint64_t t) block->t0 = t; } -STATIC_INLINE uint64_t _jl_timing_block_init(jl_timing_block_t *block, int owner) JL_NOTSAFEPOINT { - uint64_t t = cycleclock(); +STATIC_INLINE void _jl_timing_counts_ctor(jl_timing_counts_t *block, int owner) JL_NOTSAFEPOINT { block->owner = owner; block->total = 0; #ifdef JL_DEBUG_BUILD block->running = 0; #endif - _jl_timing_block_start(block, t); - return t; } +STATIC_INLINE void _jl_timing_counts_destroy(jl_timing_counts_t *block) JL_NOTSAFEPOINT { + jl_timing_counts[block->owner] += block->total; +} + +/** + * Top-level jl_timing implementation + **/ + +extern uint64_t jl_timing_enable_mask; +extern const char *jl_timing_names[(int)JL_TIMING_LAST]; +struct _jl_timing_block_t { // typedef in julia.h + struct _jl_timing_block_t *prev; + _TRACY_CTX_MEMBER + _COUNTS_CTX_MEMBER +}; + STATIC_INLINE void _jl_timing_block_ctor(jl_timing_block_t *block, int owner) JL_NOTSAFEPOINT { - uint64_t t = _jl_timing_block_init(block, owner); + uint64_t t = cycleclock(); + _COUNTS_CTOR(&block->counts_ctx, owner); + _COUNTS_START(&block->counts_ctx, t); + jl_task_t *ct = jl_current_task; jl_timing_block_t **prevp = &ct->ptls->timing_stack; block->prev = *prevp; - if (block->prev) - _jl_timing_block_stop(block->prev, t); + if (block->prev) { + _COUNTS_STOP(&block->prev->counts_ctx, t); + } *prevp = block; } STATIC_INLINE void _jl_timing_block_destroy(jl_timing_block_t *block) JL_NOTSAFEPOINT { -#ifdef USE_TRACY - TracyCZoneEnd(*(block->tracy_ctx)); -#endif uint64_t t = cycleclock(); + + _COUNTS_STOP(&block->counts_ctx, t); + _COUNTS_DESTROY(&block->counts_ctx); + _TRACY_DESTROY(block->tracy_ctx); + jl_task_t *ct = jl_current_task; - _jl_timing_block_stop(block, t); - jl_timing_data[block->owner] += block->total; jl_timing_block_t **pcur = &ct->ptls->timing_stack; assert(*pcur == block); *pcur = block->prev; - if (block->prev) - _jl_timing_block_start(block->prev, t); + if (block->prev) { + _COUNTS_START(&block->prev->counts_ctx, t); + } +} + +typedef struct _jl_timing_suspend_t { + jl_task_t *ct; +} jl_timing_suspend_t; + +STATIC_INLINE void _jl_timing_suspend_ctor(jl_timing_suspend_t *suspend, const char *subsystem, jl_task_t *ct) JL_NOTSAFEPOINT { + suspend->ct = ct; +#ifdef USE_TRACY + TracyCFiberEnter(subsystem); +#endif +} + +STATIC_INLINE void _jl_timing_suspend_destroy(jl_timing_suspend_t *suspend) JL_NOTSAFEPOINT { +#ifdef USE_TRACY + TracyCFiberEnter(suspend->ct->name); +#endif } #ifdef __cplusplus @@ -159,30 +264,38 @@ struct jl_timing_block_cpp_t { jl_timing_block_cpp_t& operator=(const jl_timing_block_cpp_t &) = delete; jl_timing_block_cpp_t& operator=(const jl_timing_block_cpp_t &&) = delete; }; -#ifdef USE_TRACY -#define JL_TIMING(owner) jl_timing_block_cpp_t __timing_block(JL_TIMING_ ## owner); \ - TracyCZoneN(__tracy_ctx, #owner, strcmp(#owner, "ROOT")); \ - __timing_block.block.tracy_ctx = &__tracy_ctx; -#else -#define JL_TIMING(owner) jl_timing_block_cpp_t __timing_block(JL_TIMING_ ## owner) -#endif -#else -#ifdef USE_TRACY -#define JL_TIMING(owner) \ - __attribute__((cleanup(_jl_timing_block_destroy))) \ - jl_timing_block_t __timing_block; \ - _jl_timing_block_ctor(&__timing_block, JL_TIMING_ ## owner); \ - TracyCZoneN(__tracy_ctx, #owner, 1); \ - __timing_block.tracy_ctx = &__tracy_ctx; +#define JL_TIMING(subsystem, event) jl_timing_block_cpp_t __timing_block(JL_TIMING_ ## subsystem); \ + _TRACY_CTOR(__timing_block.block.tracy_ctx, #event, (jl_timing_enable_mask >> (JL_TIMING_ ## subsystem)) & 1) #else -#define JL_TIMING(owner) \ +#define JL_TIMING(subsystem, event) \ __attribute__((cleanup(_jl_timing_block_destroy))) \ jl_timing_block_t __timing_block; \ - _jl_timing_block_ctor(&__timing_block, JL_TIMING_ ## owner) -#endif + _jl_timing_block_ctor(&__timing_block, JL_TIMING_ ## subsystem); \ + _TRACY_CTOR(__timing_block.tracy_ctx, #event, (jl_timing_enable_mask >> (JL_TIMING_ ## subsystem)) & 1) #endif +#ifdef __cplusplus +struct jl_timing_suspend_cpp_t { + jl_timing_suspend_t suspend; + jl_timing_suspend_cpp_t(const char *subsystem, jl_task_t *ct) JL_NOTSAFEPOINT { + _jl_timing_suspend_ctor(&suspend, subsystem, ct); + } + ~jl_timing_suspend_cpp_t() JL_NOTSAFEPOINT { + _jl_timing_suspend_destroy(&suspend); + } + jl_timing_suspend_cpp_t(const jl_timing_block_cpp_t&) = delete; + jl_timing_suspend_cpp_t(const jl_timing_block_cpp_t&&) = delete; + jl_timing_suspend_cpp_t& operator=(const jl_timing_block_cpp_t &) = delete; + jl_timing_suspend_cpp_t& operator=(const jl_timing_block_cpp_t &&) = delete; +}; +#define JL_TIMING_SUSPEND(subsystem, ct) jl_timing_suspend_cpp_t __suspend_block(#subsystem, ct) +#else +#define JL_TIMING_SUSPEND(subsystem, ct) \ + __attribute__((cleanup(_jl_timing_suspend_destroy))) \ + jl_timing_suspend_t __timing_suspend; \ + _jl_timing_suspend_ctor(&__timing_suspend, #subsystem, ct) #endif + #endif #endif diff --git a/src/toplevel.c b/src/toplevel.c index 6f29c0a82d617..ad282a50d91ac 100644 --- a/src/toplevel.c +++ b/src/toplevel.c @@ -64,7 +64,8 @@ static jl_function_t *jl_module_get_initializer(jl_module_t *m JL_PROPAGATES_ROO void jl_module_run_initializer(jl_module_t *m) { - JL_TIMING(INIT_MODULE); + JL_TIMING(INIT_MODULE, INIT_MODULE); + jl_timing_show_module(m, JL_TIMING_CURRENT_BLOCK); jl_function_t *f = jl_module_get_initializer(m); if (f == NULL) return; From e9ac58403833bdf778868d6f269ddedd3e828c2e Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Tue, 28 Mar 2023 10:53:34 -0400 Subject: [PATCH 04/12] Add `WAIT_FOR_TRACY` environment variable This environmental variable instructs the Julia runtime to wait until it receives a connection from the Tracy profiler before starting. This allows us to get a complete trace, without having to accumulate a large queue of events to send over the wire later. --- src/jlapi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/jlapi.c b/src/jlapi.c index 5c6f01ab86a88..8f5e3e6cb13dc 100644 --- a/src/jlapi.c +++ b/src/jlapi.c @@ -15,6 +15,10 @@ #include "julia_assert.h" #include "julia_internal.h" +#ifdef USE_TRACY +#include "tracy/TracyC.h" +#endif + #ifdef __cplusplus #include extern "C" { @@ -680,6 +684,11 @@ static void rr_detach_teleport(void) { JL_DLLEXPORT int jl_repl_entrypoint(int argc, char *argv[]) { +#ifdef USE_TRACY + if (getenv("WAIT_FOR_TRACY")) + while (!TracyCIsConnected) ; // Wait for connection +#endif + // no-op on Windows, note that the caller must have already converted // from `wchar_t` to `UTF-8` already if we're running on Windows. uv_setup_args(argc, argv); From c90b266b39502debff43f608f50919332b650598 Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Wed, 5 Apr 2023 06:45:21 -0400 Subject: [PATCH 05/12] deps: Add `TracyPlotConfig` patch for Tracy This functionality is currently only exposed on the master branch of Tracy, but it's quite useful so let's carry the patch to expose it until the next Tracy release. --- deps/libtracyclient.mk | 7 ++- .../patches/libTracyClient-config-plots.patch | 48 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 deps/patches/libTracyClient-config-plots.patch diff --git a/deps/libtracyclient.mk b/deps/libtracyclient.mk index aee5d6e969770..693df1e20fbfb 100644 --- a/deps/libtracyclient.mk +++ b/deps/libtracyclient.mk @@ -36,7 +36,12 @@ $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-no-crash-handler.patch-applied: $(LIBT patch -p1 -f < $(SRCDIR)/patches/libTracyClient-no-crash-handler.patch echo 1 > $@ -$(LIBTRACYCLIENT_BUILDDIR)/build-configured: $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-no-crash-handler.patch-applied +$(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-config-plots.patch-applied: $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-no-crash-handler.patch-applied + cd $(LIBTRACYCLIENT_BUILDDIR) && \ + patch -p1 -f < $(SRCDIR)/patches/libTracyClient-config-plots.patch + echo 1 > $@ + +$(LIBTRACYCLIENT_BUILDDIR)/build-configured: $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-config-plots.patch-applied mkdir -p $(dir $@) cd $(dir $@) && \ $(CMAKE) . $(CMAKE_GENERATOR_COMMAND) $(CMAKE_COMMON) $(LIBTRACYCLIENT_CMAKE) \ diff --git a/deps/patches/libTracyClient-config-plots.patch b/deps/patches/libTracyClient-config-plots.patch new file mode 100644 index 0000000000000..ff54f642d36ca --- /dev/null +++ b/deps/patches/libTracyClient-config-plots.patch @@ -0,0 +1,48 @@ +commit 7151c6afd9cc40877325c64bd19bcff7211fbd59 +Author: Bartosz Taudul +Date: Wed Mar 8 23:18:36 2023 +0100 + + Add support for configuring plots to C API. + +diff --git a/public/client/TracyProfiler.cpp b/public/client/TracyProfiler.cpp +index 6104a7ed..38b5ea13 100644 +--- a/public/client/TracyProfiler.cpp ++++ b/public/client/TracyProfiler.cpp +@@ -4149,4 +4149,5 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_ + TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); } ++TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color ) { tracy::Profiler::ConfigurePlot( name, tracy::PlotFormatType(type), step, fill, color ); } + TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); } + TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); } + TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); } +diff --git a/public/tracy/TracyC.h b/public/tracy/TracyC.h +index bedf5e16..736b51ed 100644 +--- a/public/tracy/TracyC.h ++++ b/public/tracy/TracyC.h +@@ -11,6 +11,13 @@ + extern "C" { + #endif + ++enum TracyPlotFormatEnum ++{ ++ TracyPlotFormatNumber, ++ TracyPlotFormatMemory, ++ TracyPlotFormatPercentage, ++}; ++ + TRACY_API void ___tracy_set_thread_name( const char* name ); + + #define TracyCSetThreadName( name ) ___tracy_set_thread_name( name ); +@@ -49,4 +56,5 @@ typedef const void* TracyCZoneCtx; + #define TracyCPlot(x,y) ++#define TracyCPlotConfig(x,y,z,w,a) + #define TracyCMessage(x,y) + #define TracyCMessageL(x) + #define TracyCMessageC(x,y,z) +@@ -276,7 +284,9 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_ + TRACY_API void ___tracy_emit_plot( const char* name, double val ); ++TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color ); + TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ); + + #define TracyCPlot( name, val ) ___tracy_emit_plot( name, val ); ++#define TracyCPlotConfig( name, type, step, fill, color ) ___tracy_emit_plot_config( name, type, step, fill, color ); + #define TracyCAppInfo( txt, size ) ___tracy_emit_message_appinfo( txt, size ); From fa20904406a7dde60b59eefb416311eb2a33ab3d Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Wed, 5 Apr 2023 06:46:51 -0400 Subject: [PATCH 06/12] rtutils: Add "quiet" type-printing for `jl_static_show` This is useful for Tracy, which can receive method signature strings over the write. For performance reasons, we want to keep those strings as small as possible, typically less than 80 characters long. This change adds basic "configuration" support for `jl_static_show`, including only a "quiet" parameter for now which can be used to avoid printing DataType parameters. --- src/julia_internal.h | 3 + src/rtutils.c | 141 ++++++++++++++++++++++--------------------- src/timing.c | 3 +- 3 files changed, 77 insertions(+), 70 deletions(-) diff --git a/src/julia_internal.h b/src/julia_internal.h index 73f8b9467fcf4..80a7339fe458b 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -583,6 +583,9 @@ void jl_gc_reset_alloc_count(void); uint32_t jl_get_gs_ctr(void); void jl_set_gs_ctr(uint32_t ctr); +typedef struct _jl_static_show_config_t { uint8_t quiet; } jl_static_show_config_t; +size_t jl_static_show_func_sig_(JL_STREAM *s, jl_value_t *type, jl_static_show_config_t ctx) JL_NOTSAFEPOINT; + STATIC_INLINE jl_value_t *undefref_check(jl_datatype_t *dt, jl_value_t *v) JL_NOTSAFEPOINT { if (dt->layout->first_ptr >= 0) { diff --git a/src/rtutils.c b/src/rtutils.c index 41e45cd468039..a2ec34102f148 100644 --- a/src/rtutils.c +++ b/src/rtutils.c @@ -538,14 +538,23 @@ JL_DLLEXPORT jl_value_t *jl_stderr_obj(void) JL_NOTSAFEPOINT // toys for debugging --------------------------------------------------------- -static size_t jl_show_svec(JL_STREAM *out, jl_svec_t *t, const char *head, const char *opn, const char *cls) JL_NOTSAFEPOINT +struct recur_list { + struct recur_list *prev; + jl_value_t *v; +}; + +static size_t jl_static_show_x(JL_STREAM *out, jl_value_t *v, struct recur_list *depth, jl_static_show_config_t ctx) JL_NOTSAFEPOINT; +static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt, struct recur_list *depth, jl_static_show_config_t ctx) JL_NOTSAFEPOINT; +static size_t jl_static_show_next_(JL_STREAM *out, jl_value_t *v, jl_value_t *prev, struct recur_list *depth, jl_static_show_config_t ctx) JL_NOTSAFEPOINT; + +static size_t jl_show_svec(JL_STREAM *out, jl_svec_t *t, const char *head, const char *opn, const char *cls, jl_static_show_config_t ctx) JL_NOTSAFEPOINT { size_t i, n=0, len = jl_svec_len(t); n += jl_printf(out, "%s", head); n += jl_printf(out, "%s", opn); for (i = 0; i < len; i++) { jl_value_t *v = jl_svecref(t,i); - n += jl_static_show(out, v); + n += jl_static_show_x(out, v, 0, ctx); if (i != len-1) n += jl_printf(out, ", "); } @@ -553,14 +562,6 @@ static size_t jl_show_svec(JL_STREAM *out, jl_svec_t *t, const char *head, const return n; } -struct recur_list { - struct recur_list *prev; - jl_value_t *v; -}; - -static size_t jl_static_show_x(JL_STREAM *out, jl_value_t *v, struct recur_list *depth) JL_NOTSAFEPOINT; -static size_t jl_static_show_next_(JL_STREAM *out, jl_value_t *v, jl_value_t *prev, struct recur_list *depth) JL_NOTSAFEPOINT; - JL_DLLEXPORT int jl_id_start_char(uint32_t wc) JL_NOTSAFEPOINT; JL_DLLEXPORT int jl_id_char(uint32_t wc) JL_NOTSAFEPOINT; @@ -697,7 +698,7 @@ static int jl_static_is_function_(jl_datatype_t *vt) JL_NOTSAFEPOINT { // This is necessary to make sure that this function doesn't allocate any // memory through the Julia GC static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt, - struct recur_list *depth) JL_NOTSAFEPOINT + struct recur_list *depth, jl_static_show_config_t ctx) JL_NOTSAFEPOINT { size_t n = 0; if ((uintptr_t)vt < 4096U) { @@ -705,7 +706,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt } else if ((uintptr_t)v < 4096U) { n += jl_printf(out, ""); } else if (vt == (jl_datatype_t*)jl_buff_tag) { @@ -746,17 +747,17 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt n += jl_static_show_func_sig(out, li->def.method->sig); } else { - n += jl_static_show_x(out, (jl_value_t*)li->def.module, depth); + n += jl_static_show_x(out, (jl_value_t*)li->def.module, depth, ctx); n += jl_printf(out, ". -> "); - n += jl_static_show_x(out, jl_atomic_load_relaxed(&li->uninferred), depth); + n += jl_static_show_x(out, jl_atomic_load_relaxed(&li->uninferred), depth, ctx); } } else if (vt == jl_typename_type) { - n += jl_static_show_x(out, jl_unwrap_unionall(((jl_typename_t*)v)->wrapper), depth); + n += jl_static_show_x(out, jl_unwrap_unionall(((jl_typename_t*)v)->wrapper), depth, ctx); n += jl_printf(out, ".name"); } else if (vt == jl_simplevector_type) { - n += jl_show_svec(out, (jl_svec_t*)v, "svec", "(", ")"); + n += jl_show_svec(out, (jl_svec_t*)v, "svec", "(", ")", ctx); } else if (v == (jl_value_t*)jl_unionall_type) { // avoid printing `typeof(Type)` for `UnionAll`. @@ -767,10 +768,10 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt n += jl_printf(out, "Vararg"); if (vm->T) { n += jl_printf(out, "{"); - n += jl_static_show_x(out, vm->T, depth); + n += jl_static_show_x(out, vm->T, depth, ctx); if (vm->N) { n += jl_printf(out, ", "); - n += jl_static_show_x(out, vm->N, depth); + n += jl_static_show_x(out, vm->N, depth, ctx); } n += jl_printf(out, "}"); } @@ -797,7 +798,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt } if (taillen == tlen && taillen > 3) { n += jl_printf(out, "NTuple{%d, ", tlen); - n += jl_static_show_x(out, jl_tparam0(dv), depth); + n += jl_static_show_x(out, jl_tparam0(dv), depth, ctx); n += jl_printf(out, "}"); } else { @@ -805,22 +806,25 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt for (i = 0; i < (taillen > 3 ? tlen-taillen : tlen); i++) { if (i > 0) n += jl_printf(out, ", "); - n += jl_static_show_x(out, jl_tparam(dv, i), depth); + n += jl_static_show_x(out, jl_tparam(dv, i), depth, ctx); } if (taillen > 3) { n += jl_printf(out, ", Vararg{"); - n += jl_static_show_x(out, jl_tparam(dv, tlen-1), depth); + n += jl_static_show_x(out, jl_tparam(dv, tlen-1), depth, ctx); n += jl_printf(out, ", %d}", taillen); } n += jl_printf(out, "}"); } return n; } + if (ctx.quiet) { + return jl_printf(out, "%s", jl_symbol_name(dv->name->name)); + } if (globfunc) { n += jl_printf(out, "typeof("); } if (jl_core_module && (dv->name->module != jl_core_module || !jl_module_exports_p(jl_core_module, sym))) { - n += jl_static_show_x(out, (jl_value_t*)dv->name->module, depth); + n += jl_static_show_x(out, (jl_value_t*)dv->name->module, depth, ctx); n += jl_printf(out, "."); size_t i = 0; if (globfunc && !jl_id_start_char(u8_nextchar(sn, &i))) { @@ -841,7 +845,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt n += jl_printf(out, "{"); for (j = 0; j < tlen; j++) { jl_value_t *p = jl_tparam(dv,j); - n += jl_static_show_x(out, p, depth); + n += jl_static_show_x(out, p, depth, ctx); if (j != tlen-1) n += jl_printf(out, ", "); } @@ -908,22 +912,22 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt n += jl_printf(out, "Union{"); while (jl_is_uniontype(v)) { // tail-recurse on b to flatten the printing of the Union structure in the common case - n += jl_static_show_x(out, ((jl_uniontype_t*)v)->a, depth); + n += jl_static_show_x(out, ((jl_uniontype_t*)v)->a, depth, ctx); n += jl_printf(out, ", "); v = ((jl_uniontype_t*)v)->b; } - n += jl_static_show_x(out, v, depth); + n += jl_static_show_x(out, v, depth, ctx); n += jl_printf(out, "}"); } else if (vt == jl_unionall_type) { jl_unionall_t *ua = (jl_unionall_t*)v; - n += jl_static_show_x(out, ua->body, depth); + n += jl_static_show_x(out, ua->body, depth, ctx); n += jl_printf(out, " where "); - n += jl_static_show_x(out, (jl_value_t*)ua->var, depth->prev); + n += jl_static_show_x(out, (jl_value_t*)ua->var, depth->prev, ctx); } else if (vt == jl_typename_type) { n += jl_printf(out, "typename("); - n += jl_static_show_x(out, jl_unwrap_unionall(((jl_typename_t*)v)->wrapper), depth); + n += jl_static_show_x(out, jl_unwrap_unionall(((jl_typename_t*)v)->wrapper), depth, ctx); n += jl_printf(out, ")"); } else if (vt == jl_tvar_type) { @@ -943,7 +947,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt int ua = jl_is_unionall(lb); if (ua) n += jl_printf(out, "("); - n += jl_static_show_x(out, lb, depth); + n += jl_static_show_x(out, lb, depth, ctx); if (ua) n += jl_printf(out, ")"); n += jl_printf(out, "<:"); @@ -955,7 +959,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt n += jl_printf(out, "<:"); if (ua) n += jl_printf(out, "("); - n += jl_static_show_x(out, ub, depth); + n += jl_static_show_x(out, ub, depth, ctx); if (ua) n += jl_printf(out, ")"); } @@ -963,7 +967,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt else if (vt == jl_module_type) { jl_module_t *m = (jl_module_t*)v; if (m->parent != m && m->parent != jl_main_module) { - n += jl_static_show_x(out, (jl_value_t*)m->parent, depth); + n += jl_static_show_x(out, (jl_value_t*)m->parent, depth, ctx); n += jl_printf(out, "."); } n += jl_printf(out, "%s", jl_symbol_name(m->name)); @@ -984,7 +988,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt (uintptr_t)((jl_ssavalue_t*)v)->id); } else if (vt == jl_globalref_type) { - n += jl_static_show_x(out, (jl_value_t*)jl_globalref_mod(v), depth); + n += jl_static_show_x(out, (jl_value_t*)jl_globalref_mod(v), depth, ctx); char *name = jl_symbol_name(jl_globalref_name(v)); n += jl_printf(out, jl_is_identifier(name) ? ".%s" : ".:(%s)", name); } @@ -999,7 +1003,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt else { n += jl_printf(out, ":("); } - n += jl_static_show_x(out, qv, depth); + n += jl_static_show_x(out, qv, depth, ctx); if (!jl_is_symbol(qv)) { n += jl_printf(out, " end"); } @@ -1009,20 +1013,20 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt } else if (vt == jl_newvarnode_type) { n += jl_printf(out, ""); } else if (vt == jl_linenumbernode_type) { n += jl_printf(out, "#= "); - n += jl_static_show_x(out, jl_linenode_file(v), depth); + n += jl_static_show_x(out, jl_linenode_file(v), depth, ctx); n += jl_printf(out, ":%" PRIuPTR " =#", jl_linenode_line(v)); } else if (vt == jl_expr_type) { jl_expr_t *e = (jl_expr_t*)v; if (e->head == jl_assign_sym && jl_array_len(e->args) == 2) { - n += jl_static_show_x(out, jl_exprarg(e,0), depth); + n += jl_static_show_x(out, jl_exprarg(e,0), depth, ctx); n += jl_printf(out, " = "); - n += jl_static_show_x(out, jl_exprarg(e,1), depth); + n += jl_static_show_x(out, jl_exprarg(e,1), depth, ctx); } else { char sep = ' '; @@ -1030,14 +1034,14 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt size_t i, len = jl_array_len(e->args); for (i = 0; i < len; i++) { n += jl_printf(out, ",%c", sep); - n += jl_static_show_x(out, jl_exprarg(e,i), depth); + n += jl_static_show_x(out, jl_exprarg(e,i), depth, ctx); } n += jl_printf(out, ")"); } } else if (jl_array_type && jl_is_array_type(vt)) { n += jl_printf(out, "Array{"); - n += jl_static_show_x(out, (jl_value_t*)jl_tparam0(vt), depth); + n += jl_static_show_x(out, (jl_value_t*)jl_tparam0(vt), depth, ctx); n += jl_printf(out, ", ("); size_t i, ndims = jl_array_ndims(v); if (ndims == 1) @@ -1072,13 +1076,13 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt for (j = 0; j < tlen; j++) { if (av->flags.ptrarray) { jl_value_t **ptr = ((jl_value_t**)av->data) + j; - n += jl_static_show_x(out, *ptr, depth); + n += jl_static_show_x(out, *ptr, depth, ctx); } else { char *ptr = ((char*)av->data) + j * av->elsize; n += jl_static_show_x_(out, (jl_value_t*)ptr, typetagdata ? (jl_datatype_t*)jl_nth_union_component(el_type, typetagdata[j]) : (jl_datatype_t*)el_type, - depth); + depth, ctx); } if (j != tlen - 1) n += jl_printf(out, nlsep ? ",\n " : ", "); @@ -1087,16 +1091,16 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt } else if (vt == jl_loaderror_type) { n += jl_printf(out, "LoadError(at "); - n += jl_static_show_x(out, *(jl_value_t**)v, depth); + n += jl_static_show_x(out, *(jl_value_t**)v, depth, ctx); // Access the field directly to avoid allocation n += jl_printf(out, " line %" PRIdPTR, ((intptr_t*)v)[1]); n += jl_printf(out, ": "); - n += jl_static_show_x(out, ((jl_value_t**)v)[2], depth); + n += jl_static_show_x(out, ((jl_value_t**)v)[2], depth, ctx); n += jl_printf(out, ")"); } else if (vt == jl_errorexception_type) { n += jl_printf(out, "ErrorException("); - n += jl_static_show_x(out, *(jl_value_t**)v, depth); + n += jl_static_show_x(out, *(jl_value_t**)v, depth, ctx); n += jl_printf(out, ")"); } else if (jl_static_is_function_(vt) && is_globname_binding(v, (jl_datatype_t*)vt)) { @@ -1106,7 +1110,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt int globfunc = is_globfunction(v, dv, &sym); int quote = 0; if (jl_core_module && (dv->name->module != jl_core_module || !jl_module_exports_p(jl_core_module, sym))) { - n += jl_static_show_x(out, (jl_value_t*)dv->name->module, depth); + n += jl_static_show_x(out, (jl_value_t*)dv->name->module, depth, ctx); n += jl_printf(out, "."); size_t i = 0; @@ -1136,7 +1140,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt n += jl_printf(out, "NamedTuple"); } else if (!istuple) { - n += jl_static_show_x(out, (jl_value_t*)vt, depth); + n += jl_static_show_x(out, (jl_value_t*)vt, depth, ctx); } n += jl_printf(out, "("); size_t nb = jl_datatype_size(vt); @@ -1159,7 +1163,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt size_t offs = jl_field_offset(vt, i); char *fld_ptr = (char*)v + offs; if (jl_field_isptr(vt, i)) { - n += jl_static_show_x(out, *(jl_value_t**)fld_ptr, depth); + n += jl_static_show_x(out, *(jl_value_t**)fld_ptr, depth, ctx); } else { jl_datatype_t *ft = (jl_datatype_t*)jl_field_type_concrete(vt, i); @@ -1167,7 +1171,7 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt uint8_t sel = ((uint8_t*)fld_ptr)[jl_field_size(vt, i) - 1]; ft = (jl_datatype_t*)jl_nth_union_component((jl_value_t*)ft, sel); } - n += jl_static_show_x_(out, (jl_value_t*)fld_ptr, ft, depth); + n += jl_static_show_x_(out, (jl_value_t*)fld_ptr, ft, depth, ctx); } if ((istuple || isnamedtuple) && tlen == 1) n += jl_printf(out, ","); @@ -1177,26 +1181,26 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt if (vt == jl_typemap_entry_type) { n += jl_printf(out, ", next=↩︎\n "); jl_value_t *next = (jl_value_t*)jl_atomic_load_relaxed(&((jl_typemap_entry_t*)v)->next); - n += jl_static_show_next_(out, next, v, depth); + n += jl_static_show_next_(out, next, v, depth, ctx); } } n += jl_printf(out, ")"); } else { n += jl_printf(out, ""); } return n; } -static size_t jl_static_show_x(JL_STREAM *out, jl_value_t *v, struct recur_list *depth) JL_NOTSAFEPOINT +static size_t jl_static_show_x(JL_STREAM *out, jl_value_t *v, struct recur_list *depth, jl_static_show_config_t ctx) JL_NOTSAFEPOINT { // show values without calling a julia method or allocating through the GC - return jl_static_show_next_(out, v, NULL, depth); + return jl_static_show_next_(out, v, NULL, depth, ctx); } -static size_t jl_static_show_next_(JL_STREAM *out, jl_value_t *v, jl_value_t *prev, struct recur_list *depth) JL_NOTSAFEPOINT +static size_t jl_static_show_next_(JL_STREAM *out, jl_value_t *v, jl_value_t *prev, struct recur_list *depth, jl_static_show_config_t ctx) JL_NOTSAFEPOINT { // helper for showing a typemap list by following the next pointers // while being careful about avoiding any recursion due to malformed (circular) references @@ -1217,7 +1221,7 @@ static size_t jl_static_show_next_(JL_STREAM *out, jl_value_t *v, jl_value_t *pr while (m && jl_typeis(m, jl_typemap_entry_type)) { if (m == v) { return jl_printf(out, "sig, depth) + + jl_static_show_x(out, (jl_value_t*)((jl_typemap_entry_t*)m)->sig, depth, ctx) + jl_printf(out, ">"); } if (m == prev) { @@ -1248,15 +1252,22 @@ static size_t jl_static_show_next_(JL_STREAM *out, jl_value_t *v, jl_value_t *pr dist++; p = p->prev; } - return jl_static_show_x_(out, v, (jl_datatype_t*)jl_typeof(v), newdepth); + return jl_static_show_x_(out, v, (jl_datatype_t*)jl_typeof(v), newdepth, ctx); } JL_DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) JL_NOTSAFEPOINT { - return jl_static_show_x(out, v, 0); + jl_static_show_config_t ctx = { /* quiet */ 0 }; + return jl_static_show_x(out, v, 0, ctx); } JL_DLLEXPORT size_t jl_static_show_func_sig(JL_STREAM *s, jl_value_t *type) JL_NOTSAFEPOINT +{ + jl_static_show_config_t ctx = { /* quiet */ 0 }; + return jl_static_show_func_sig_(s, type, ctx); +} + +JL_DLLEXPORT size_t jl_static_show_func_sig_(JL_STREAM *s, jl_value_t *type, jl_static_show_config_t ctx) JL_NOTSAFEPOINT { size_t n = 0; size_t i; @@ -1286,7 +1297,7 @@ JL_DLLEXPORT size_t jl_static_show_func_sig(JL_STREAM *s, jl_value_t *type) JL_N } else { n += jl_printf(s, "(::"); - n += jl_static_show_x(s, ftype, depth); + n += jl_static_show_x(s, ftype, depth, ctx); n += jl_printf(s, ")"); } size_t tl = jl_nparams(type); @@ -1294,11 +1305,7 @@ JL_DLLEXPORT size_t jl_static_show_func_sig(JL_STREAM *s, jl_value_t *type) JL_N for (i = 1; i < tl; i++) { jl_value_t *tp = jl_tparam(type, i); if (i != tl - 1) { - if (jl_is_datatype(tp)) { - n += jl_printf(s, "%s", jl_symbol_name(((jl_datatype_t*)tp)->name->name)); - } else { - n += jl_static_show_x(s, tp, depth); - } + n += jl_static_show_x(s, tp, depth, ctx); n += jl_printf(s, ", "); } else { @@ -1306,17 +1313,13 @@ JL_DLLEXPORT size_t jl_static_show_func_sig(JL_STREAM *s, jl_value_t *type) JL_N tp = jl_unwrap_vararg(tp); if (jl_is_unionall(tp)) n += jl_printf(s, "("); - if (jl_is_datatype(tp)) { - n += jl_printf(s, "%s", jl_symbol_name(((jl_datatype_t*)tp)->name->name)); - } else { - n += jl_static_show_x(s, tp, depth); - } + n += jl_static_show_x(s, tp, depth, ctx); if (jl_is_unionall(tp)) n += jl_printf(s, ")"); n += jl_printf(s, "..."); } else { - n += jl_static_show_x(s, tp, depth); + n += jl_static_show_x(s, tp, depth, ctx); } } } @@ -1328,7 +1331,7 @@ JL_DLLEXPORT size_t jl_static_show_func_sig(JL_STREAM *s, jl_value_t *type) JL_N while (jl_is_unionall(tvars)) { if (!first) n += jl_printf(s, ", "); - n += jl_static_show_x(s, (jl_value_t*)tvars->var, first ? NULL : depth); + n += jl_static_show_x(s, (jl_value_t*)tvars->var, first ? NULL : depth, ctx); tvars = (jl_unionall_t*)tvars->body; if (!first) depth += 1; diff --git a/src/timing.c b/src/timing.c index b2a483747dc6e..c3ba3809884bf 100644 --- a/src/timing.c +++ b/src/timing.c @@ -176,7 +176,8 @@ JL_DLLEXPORT void jl_timing_show_func_sig(jl_value_t *v, jl_timing_block_t *cur_ ios_mem(&buf, IOS_INLSIZE); buf.growable = 0; // Restrict to inline buffer to avoid allocation - jl_static_show_func_sig((JL_STREAM*)&buf, v); + jl_static_show_config_t config = { /* quiet */ 1 }; + jl_static_show_func_sig_((JL_STREAM*)&buf, v, config); if (buf.size == buf.maxsize) memset(&buf.buf[IOS_INLSIZE - 3], '.', 3); From 3b68d4908867b20200a72e93bd988ae04b12c72f Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Wed, 5 Apr 2023 06:58:54 -0400 Subject: [PATCH 07/12] profiling: Add `WITH_TIMING_COUNTS` build option This replaces the `ENABLE_TIMINGS` #define in `options.h` Since we now support multiple back-ends for timing events, each back-end is expected to be enabled with a `WITH_*` Makefile flag, and then the JL_TIMING API is automatically enabled if any back-end is available. Turning on multiple back-ends at the same time is also supported. --- Make.inc | 8 ++++++++ src/init.c | 2 +- src/options.h | 3 --- src/timing.h | 12 +++++++----- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/Make.inc b/Make.inc index 4d91cd9df9525..a5cf1c3592579 100644 --- a/Make.inc +++ b/Make.inc @@ -95,6 +95,9 @@ WITH_ITTAPI := 0 # Enable Tracy support WITH_TRACY := 0 +# Enable Timing Counts support +WITH_TIMING_COUNTS := 0 + # Prevent picking up $ARCH from the environment variables ARCH:= @@ -743,6 +746,11 @@ JCFLAGS += -DUSE_TRACY -DTRACY_ENABLE -DTRACY_FIBERS LIBTRACYCLIENT:=-lTracyClient endif +ifeq ($(WITH_TIMING_COUNTS), 1) +JCXXFLAGS += -DUSE_TIMING_COUNTS +JCFLAGS += -DUSE_TIMING_COUNTS +endif + # =========================================================================== # Select the cpu architecture to target, or automatically detects the user's compiler diff --git a/src/init.c b/src/init.c index 5990bd24aaabd..efa2d51110548 100644 --- a/src/init.c +++ b/src/init.c @@ -349,7 +349,7 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) JL_NOTSAFEPOINT_ENTER // TODO: Destroy threads? jl_destroy_timing(); // cleans up the current timing_stack for noreturn -#ifdef ENABLE_TIMINGS +#ifdef USE_TIMING_COUNTS jl_print_timings(); #endif jl_teardown_codegen(); // prints stats diff --git a/src/options.h b/src/options.h index 12822b6ecf8b0..5253bcab0456f 100644 --- a/src/options.h +++ b/src/options.h @@ -78,9 +78,6 @@ // OBJPROFILE counts objects by type // #define OBJPROFILE -// Automatic Instrumenting Profiler -#define ENABLE_TIMINGS - // method dispatch profiling -------------------------------------------------- diff --git a/src/timing.h b/src/timing.h index 920440edf9da7..eccec059cf667 100644 --- a/src/timing.h +++ b/src/timing.h @@ -28,6 +28,10 @@ int jl_timing_set_enable(const char *subsystem, uint8_t enabled); #define HAVE_TIMING_SUPPORT #endif +#if defined( USE_TRACY ) || defined( USE_TIMING_COUNTS ) +#define ENABLE_TIMINGS +#endif + #if !defined( ENABLE_TIMINGS ) || !defined( HAVE_TIMING_SUPPORT ) #define JL_TIMING(subsystem, event) @@ -118,9 +122,7 @@ enum jl_timing_owners { * Timing Backend: Aggregated timing counts (implemented in timing.c) **/ -#define USE_COUNTS - -#ifdef USE_COUNTS +#ifdef USE_TIMING_COUNTS #define _COUNTS_CTX_MEMBER jl_timing_counts_t counts_ctx; #define _COUNTS_CTOR(block, owner) _jl_timing_counts_ctor(block, owner) #define _COUNTS_DESTROY(block) _jl_timing_counts_destroy(block) @@ -204,7 +206,7 @@ struct _jl_timing_block_t { // typedef in julia.h }; STATIC_INLINE void _jl_timing_block_ctor(jl_timing_block_t *block, int owner) JL_NOTSAFEPOINT { - uint64_t t = cycleclock(); + uint64_t t = cycleclock(); (void)t; _COUNTS_CTOR(&block->counts_ctx, owner); _COUNTS_START(&block->counts_ctx, t); @@ -218,7 +220,7 @@ STATIC_INLINE void _jl_timing_block_ctor(jl_timing_block_t *block, int owner) JL } STATIC_INLINE void _jl_timing_block_destroy(jl_timing_block_t *block) JL_NOTSAFEPOINT { - uint64_t t = cycleclock(); + uint64_t t = cycleclock(); (void)t; _COUNTS_STOP(&block->counts_ctx, t); _COUNTS_DESTROY(&block->counts_ctx); From 9ca700e06e4b9de4d4b292ce8eb6d3294bd1b353 Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Wed, 5 Apr 2023 09:06:43 -0400 Subject: [PATCH 08/12] profiling: limit reported method instances Cap the number of method instances we report to the profiler for a single LLVM_MODULE_FINISH to 10. --- src/jitlayers.cpp | 8 +++++++- src/timing.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 7c68b78c27850..75f2b244dacd3 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -248,9 +248,12 @@ static jl_callptr_t _jl_compile_codeinst( IndirectCodeinsts += emitted.size() - 1; } + size_t i = 0; for (auto &def : emitted) { jl_code_instance_t *this_code = def.first; - jl_timing_show_func_sig(this_code->def->specTypes, JL_TIMING_CURRENT_BLOCK); + if (i < 10) + jl_timing_show_func_sig(this_code->def->specTypes, JL_TIMING_CURRENT_BLOCK); + jl_llvm_functions_t decls = std::get<1>(def.second); jl_callptr_t addr; bool isspecsig = false; @@ -292,7 +295,10 @@ static jl_callptr_t _jl_compile_codeinst( } if (this_code == codeinst) fptr = addr; + i++; } + if (i > 10) + jl_timing_printf(JL_TIMING_CURRENT_BLOCK, "... <%d methods truncated>", i - 10); uint64_t end_time = 0; if (timed) diff --git a/src/timing.h b/src/timing.h index eccec059cf667..d7dc53ad0e8be 100644 --- a/src/timing.h +++ b/src/timing.h @@ -41,6 +41,7 @@ int jl_timing_set_enable(const char *subsystem, uint8_t enabled); #define jl_timing_show_filename(f, b) #define jl_timing_show_method_instance(mi, b) #define jl_timing_show_func_sig(tt, b) +#define jl_timing_printf(s, f, ...) #define jl_timing_block_enter_task(ct, ptls, blk) #define jl_timing_block_exit_task(ct, ptls) ((jl_timing_block_t *)NULL) #define jl_pop_timing_block(blk) From 152653452011f5ad5cf45da3b37e19759f62f3d0 Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Wed, 5 Apr 2023 15:35:41 -0400 Subject: [PATCH 09/12] deps: Upgrade LibTracyClient to 0.9.1 --- deps/libtracyclient.mk | 12 +-- deps/libtracyclient.version | 8 +- .../libTracyClient-no-crash-handler.patch | 21 ----- deps/patches/libTracyClient-no-sampling.patch | 79 +++++++++++++++++++ ...patch => libTracyClient-plot-config.patch} | 15 +++- 5 files changed, 101 insertions(+), 34 deletions(-) delete mode 100644 deps/patches/libTracyClient-no-crash-handler.patch create mode 100644 deps/patches/libTracyClient-no-sampling.patch rename deps/patches/{libTracyClient-config-plots.patch => libTracyClient-plot-config.patch} (73%) diff --git a/deps/libtracyclient.mk b/deps/libtracyclient.mk index 693df1e20fbfb..bf84e25ccefc9 100644 --- a/deps/libtracyclient.mk +++ b/deps/libtracyclient.mk @@ -11,7 +11,7 @@ LIBTRACYCLIENT_CMAKE := LIBTRACYCLIENT_CMAKE += -DBUILD_SHARED_LIBS=ON LIBTRACYCLIENT_CMAKE += -DTRACY_FIBERS=ON LIBTRACYCLIENT_CMAKE += -DTRACY_NO_BROADCAST=ON -LIBTRACYCLIENT_CMAKE += -DTRACY_NO_SYSTEM_TRACING=ON +LIBTRACYCLIENT_CMAKE += -DTRACY_NO_SAMPLING=ON LIBTRACYCLIENT_CMAKE += -DTRACY_ONLY_LOCALHOST=ON LIBTRACYCLIENT_CMAKE += -DTRACY_NO_CODE_TRANSFER=ON LIBTRACYCLIENT_CMAKE += -DTRACY_NO_FRAME_IMAGE=ON @@ -31,17 +31,17 @@ $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-freebsd-elfw.patch-applied: $(LIBTRACY patch -p1 -f < $(SRCDIR)/patches/libTracyClient-freebsd-elfw.patch echo 1 > $@ -$(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-no-crash-handler.patch-applied: $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-freebsd-elfw.patch-applied +$(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-no-sampling.patch-applied: $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-freebsd-elfw.patch-applied cd $(LIBTRACYCLIENT_BUILDDIR) && \ - patch -p1 -f < $(SRCDIR)/patches/libTracyClient-no-crash-handler.patch + patch -p1 -f < $(SRCDIR)/patches/libTracyClient-no-sampling.patch echo 1 > $@ -$(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-config-plots.patch-applied: $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-no-crash-handler.patch-applied +$(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-plot-config.patch-applied: $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-no-sampling.patch-applied cd $(LIBTRACYCLIENT_BUILDDIR) && \ - patch -p1 -f < $(SRCDIR)/patches/libTracyClient-config-plots.patch + patch -p1 -f < $(SRCDIR)/patches/libTracyClient-plot-config.patch echo 1 > $@ -$(LIBTRACYCLIENT_BUILDDIR)/build-configured: $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-config-plots.patch-applied +$(LIBTRACYCLIENT_BUILDDIR)/build-configured: $(LIBTRACYCLIENT_BUILDDIR)/libTracyClient-plot-config.patch-applied mkdir -p $(dir $@) cd $(dir $@) && \ $(CMAKE) . $(CMAKE_GENERATOR_COMMAND) $(CMAKE_COMMON) $(LIBTRACYCLIENT_CMAKE) \ diff --git a/deps/libtracyclient.version b/deps/libtracyclient.version index 52bc74cedb660..3c0e0ebd56fa3 100644 --- a/deps/libtracyclient.version +++ b/deps/libtracyclient.version @@ -1,8 +1,8 @@ ## jll artifact LIBTRACYCLIENT_JLL_NAME := LibTracyClient -LIBTRACYCLIENT_JLL_VER := 0.9.0+1 +LIBTRACYCLIENT_JLL_VER := 0.9.1+0 ## source build -LIBTRACYCLIENT_VER := 0.9.0 -LIBTRACYCLIENT_BRANCH=v0.9 -LIBTRACYCLIENT_SHA1=5a1f5371b792c12aea324213e1dc738b2923ae21 +LIBTRACYCLIENT_VER := 0.9.1 +LIBTRACYCLIENT_BRANCH=v0.9.1 +LIBTRACYCLIENT_SHA1=897aec5b062664d2485f4f9a213715d2e527e0ca diff --git a/deps/patches/libTracyClient-no-crash-handler.patch b/deps/patches/libTracyClient-no-crash-handler.patch deleted file mode 100644 index 259b20fcff650..0000000000000 --- a/deps/patches/libTracyClient-no-crash-handler.patch +++ /dev/null @@ -1,21 +0,0 @@ -diff --git a/public/client/TracyProfiler.cpp b/public/client/TracyProfiler.cpp -index ea168e4f..9287d433 100644 ---- a/public/client/TracyProfiler.cpp -+++ b/public/client/TracyProfiler.cpp -@@ -1454,7 +1454,7 @@ Profiler::~Profiler() - if( m_crashHandlerInstalled ) RemoveVectoredExceptionHandler( m_exceptionHandler ); - #endif - --#ifdef __linux__ -+#if defined __linux__ && !defined TRACY_NO_CRASH_HANDLER - if( m_crashHandlerInstalled ) - { - sigaction( TRACY_CRASH_SIGNAL, &m_prevSignal.pwr, nullptr ); -@@ -1520,7 +1520,7 @@ bool Profiler::ShouldExit() - - void Profiler::Worker() - { --#ifdef __linux__ -+#if defined __linux__ && !defined TRACY_NO_CRASH_HANDLER - s_profilerTid = syscall( SYS_gettid ); - #endif diff --git a/deps/patches/libTracyClient-no-sampling.patch b/deps/patches/libTracyClient-no-sampling.patch new file mode 100644 index 0000000000000..c4c8576099348 --- /dev/null +++ b/deps/patches/libTracyClient-no-sampling.patch @@ -0,0 +1,79 @@ +commit 6249999153a9497b32bc84e9dc95a1537a0af714 +Author: Cody Tapscott +Date: Tue Apr 4 15:20:46 2023 -0400 + + linux: respect `TRACY_NO_SAMPLING` for sys-tracing + + This compile-time flag was being ignored on Linux. This change adds + gating for software-sampled stack trace sampling following the same + pattern as other `TRACY_NO_SAMPLE_*` options. + + If `TRACY_NO_SAMPLING=1` is provided as an environment variable, + software stack sampling is also disabled. + +diff --git a/public/client/TracySysTrace.cpp b/public/client/TracySysTrace.cpp +index 4a562eaa..af0641fe 100644 +--- a/public/client/TracySysTrace.cpp ++++ b/public/client/TracySysTrace.cpp +@@ -770,6 +770,13 @@ bool SysTraceStart( int64_t& samplingPeriod ) + TracyDebug( "sched_wakeup id: %i\n", wakeupId ); + TracyDebug( "drm_vblank_event id: %i\n", vsyncId ); + ++#ifdef TRACY_NO_SAMPLING ++ const bool noSoftwareSampling = true; ++#else ++ const char* noSoftwareSamplingEnv = GetEnvVar( "TRACY_NO_SAMPLING" ); ++ const bool noSoftwareSampling = noSoftwareSamplingEnv && noSoftwareSamplingEnv[0] == '1'; ++#endif ++ + #ifdef TRACY_NO_SAMPLE_RETIREMENT + const bool noRetirement = true; + #else +@@ -839,28 +846,31 @@ bool SysTraceStart( int64_t& samplingPeriod ) + pe.clockid = CLOCK_MONOTONIC_RAW; + #endif + +- TracyDebug( "Setup software sampling\n" ); +- ProbePreciseIp( pe, currentPid ); +- for( int i=0; i Date: Thu, 13 Apr 2023 13:19:34 -0400 Subject: [PATCH 10/12] deps: bump libtracyclient to v0.9.1+1 This enables broadcast messages so that Tracy processes can be listed by the profiler, which is a nice quality-of-life feature. --- deps/checksums/libtracyclient | 66 +++++++++++++++++------------------ deps/libtracyclient.mk | 1 - deps/libtracyclient.version | 2 +- 3 files changed, 33 insertions(+), 36 deletions(-) diff --git a/deps/checksums/libtracyclient b/deps/checksums/libtracyclient index 506377a3f4e13..55a436b059c42 100644 --- a/deps/checksums/libtracyclient +++ b/deps/checksums/libtracyclient @@ -1,34 +1,32 @@ -LibTracyClient.v0.9.0+1.aarch64-apple-darwin.tar.gz/md5/621591ea1b72b07a0be82f87ed29a456 -LibTracyClient.v0.9.0+1.aarch64-apple-darwin.tar.gz/sha512/d053db12a0256bd60730f9b9a73ed6308c4cecb3f4a31cb979e1ecd8afbec5e3217b4a4f6355e24fc0c3bcc90dc9a83bf1be1dee467544e15ae6597d9d1a8d01 -LibTracyClient.v0.9.0+1.aarch64-linux-gnu.tar.gz/md5/7e2183c4cba6108e39c58e57ba31eb53 -LibTracyClient.v0.9.0+1.aarch64-linux-gnu.tar.gz/sha512/a912d329e065aae7a9d5b4392f6c292b68fed5cbd83b06bfddf925601f84bde4a76993864ecf3750fd216313630632157ff2f3f9e659caa71530e31aa738c72d -LibTracyClient.v0.9.0+1.aarch64-linux-musl.tar.gz/md5/a9d1b9700f9ed3c8c70480da7ebf326d -LibTracyClient.v0.9.0+1.aarch64-linux-musl.tar.gz/sha512/e9e928dda72f0b1aa9a92809f6f8b6c9d3c7e99f30d1944725e6d0eae0eeba34928e0262172f6e1ccd10f99dfb44d2e39537663a4ab72ebb3ce65f8f1b001c13 -LibTracyClient.v0.9.0+1.armv6l-linux-gnueabihf.tar.gz/md5/7c1541edbe31bfb9e43f4ec09a3aa748 -LibTracyClient.v0.9.0+1.armv6l-linux-gnueabihf.tar.gz/sha512/ab8c2502c0fa743538b8929756f283514ee4c69a6fc65555dca7b95021c36ce827ee33e8594d0447f15fa9bd1df873b1a1a75f876989813386f46a803c504c06 -LibTracyClient.v0.9.0+1.armv6l-linux-musleabihf.tar.gz/md5/2904a775192b8bb53c170f28d3588ea0 -LibTracyClient.v0.9.0+1.armv6l-linux-musleabihf.tar.gz/sha512/1b1288619a72e30a1e414295591d93e122c9c478e574e31c09f49e6ee3b665a64a883cd367566cec9ba95abb5fdcc51056d9853400f441ddd0f27a369a20bae3 -LibTracyClient.v0.9.0+1.armv7l-linux-gnueabihf.tar.gz/md5/7773f17dab1acdcb6b9e749dfb04f727 -LibTracyClient.v0.9.0+1.armv7l-linux-gnueabihf.tar.gz/sha512/49b7a433aa9945cfd20702584916ab24cf2e35a67804635c11726576763a09c5f2e578002e175d9ca3e109e29c454b4ad5db2e267ed5aeb002eff45965a74704 -LibTracyClient.v0.9.0+1.armv7l-linux-musleabihf.tar.gz/md5/9c1799102529603793bf180c2fd432ec -LibTracyClient.v0.9.0+1.armv7l-linux-musleabihf.tar.gz/sha512/ab2fcde7a59754b15d36f39e88fddbf1f198e15221680b9cd0dcb7eb43becc498d17ca1763ec576479646f0d4a1947a9b39f340db800e859751105d7d7aa5ed6 -LibTracyClient.v0.9.0+1.i686-linux-gnu.tar.gz/md5/4f64c950d319cdeeec379cef58f41a13 -LibTracyClient.v0.9.0+1.i686-linux-gnu.tar.gz/sha512/9258ec31e5023e7b503c36f1d76d4e6c0b7492abeb177ffe772a64da1d5db6f199031d22956a7702a809b1263b49aef614763816d1a18f5590d0a00b3f6243f1 -LibTracyClient.v0.9.0+1.i686-linux-musl.tar.gz/md5/d5524ddb8c8537b02b737bd7f2a68275 -LibTracyClient.v0.9.0+1.i686-linux-musl.tar.gz/sha512/f149ea48cff01f6cd9da40692eed9900d5b2ff3a3e10e27bb10b62a89034f37f9a68fc080b97d41efb95718472898c8cc6271a8934f907275cde19f44582de08 -LibTracyClient.v0.9.0+1.i686-w64-mingw32.tar.gz/md5/c415459220b24c0a67553e4691801639 -LibTracyClient.v0.9.0+1.i686-w64-mingw32.tar.gz/sha512/57c8826be9fb049fa418a72dc8fbf576b6fbf45da1662f07ed041b9e55c36e487c02c43a1e64003d76a0040f0e998201e8b0d3853960023ea440a2daadcb4f77 -LibTracyClient.v0.9.0+1.powerpc64le-linux-gnu.tar.gz/md5/5eacaa3672522f45733595182ba643fc -LibTracyClient.v0.9.0+1.powerpc64le-linux-gnu.tar.gz/sha512/4edf154a9ac126fe31879b7962af127d99e5afd19dc046275ddb26b9f455431fd6fd398373a01d6f8865b090cb87ed521a5919b8037d569c569c36c7a8a2f72f -LibTracyClient.v0.9.0+1.x86_64-apple-darwin.tar.gz/md5/bb517fdccbf51c7f0889919735889d65 -LibTracyClient.v0.9.0+1.x86_64-apple-darwin.tar.gz/sha512/a3587248776c859e60d367e91908e36fd9f7fd3e27320dfac2c7527ee120c1a2652b2da1c0c1a006d2c28c7f93992dd4a3b2565e7f2c5feec6a5661cc4cf080e -LibTracyClient.v0.9.0+1.x86_64-linux-gnu.tar.gz/md5/2dd1cbcf917c7df9df110b99f393b916 -LibTracyClient.v0.9.0+1.x86_64-linux-gnu.tar.gz/sha512/3d0dc4cd3df2528678a0305baa23d5cda97406f73a3def3ff2fd8ee2517f07051e19719faf99604fddb3aa5b20574b92b240f7898d392d9e21431f275c0a8aa8 -LibTracyClient.v0.9.0+1.x86_64-linux-musl.tar.gz/md5/d1b02aaf45f13ba34f4f1138b83f8ce7 -LibTracyClient.v0.9.0+1.x86_64-linux-musl.tar.gz/sha512/7c8a3748238d015de4be7074c1efe72b2cda9dbc23c2ab722750885cd01cd4a6ea6e37b241fc997d41ab3949154b4a5bddbfd8f3a59ca153e9b42136a154a02a -LibTracyClient.v0.9.0+1.x86_64-unknown-freebsd.tar.gz/md5/7bb6f98ab2a39a062293c95f91b959f0 -LibTracyClient.v0.9.0+1.x86_64-unknown-freebsd.tar.gz/sha512/72935612fbfb339003b9be38c64a53c6a19a58b8427485b4351f18933a2ec7a4f7bf00556996501ccd3857e8085910af72020e4507951eb8ee094287a82208ae -LibTracyClient.v0.9.0+1.x86_64-w64-mingw32.tar.gz/md5/f3b60a51e8f64ec62c07597f6c4e52f7 -LibTracyClient.v0.9.0+1.x86_64-w64-mingw32.tar.gz/sha512/3ef5916f9a441e8655569c803392987a39c3baa79ac9ac446760cc60577619a616ee1365673d5323eb1c5884a6bd9e283b4094cdcbf42eba6b409a0348643b25 -libtracyclient-5a1f5371b792c12aea324213e1dc738b2923ae21.tar.gz/md5/62791801e0ffb11a7d70c2d724a230be -libtracyclient-5a1f5371b792c12aea324213e1dc738b2923ae21.tar.gz/sha512/b0570e048eee08ba5e21e0f855b2346cad408b0b86cdf79c8bb3727bb0ab57167dc7988f4dd1ee4157b371135201b87d1491237c09a2934de65eb3b9e26fcdc2 +LibTracyClient.v0.9.1+1.aarch64-apple-darwin.tar.gz/md5/540617535443c918d42415d7e775456d +LibTracyClient.v0.9.1+1.aarch64-apple-darwin.tar.gz/sha512/5dc245327900a26f20692c76c6a3043a07ee88010b814bdded79460fd77cd587b69448b074a1afc931290ef7f445771aec71a003d6e425d42c75d2cc72bdf846 +LibTracyClient.v0.9.1+1.aarch64-linux-gnu.tar.gz/md5/d2a09ad722a1f15090dd0ae6ce9c37c7 +LibTracyClient.v0.9.1+1.aarch64-linux-gnu.tar.gz/sha512/b5e6f44bb4690226dd4176a43824193c7e1a7873cf75c2e261b6cb0a614aad172c0265b6aa89849328133de9894af94a4a38b4362ec8d706d03a0cad4fd1171a +LibTracyClient.v0.9.1+1.aarch64-linux-musl.tar.gz/md5/eccc851b7346590d2636ff585e4b1f55 +LibTracyClient.v0.9.1+1.aarch64-linux-musl.tar.gz/sha512/214dd6d7ce70ce11748091143a2e89dfc6b85c62424d971eb973b1126ee3da98d8285c2f5557c7b62523f76e513692947b5ef0f046bdf183da3ddd38554b4a97 +LibTracyClient.v0.9.1+1.armv6l-linux-gnueabihf.tar.gz/md5/200b940fb4b6c7f8cb6c621ae4bab347 +LibTracyClient.v0.9.1+1.armv6l-linux-gnueabihf.tar.gz/sha512/1213b716ed3680bb8b1a682099ef325a257e29811498a731553c4d6fc8f93831038d211720da1985f72f42c6409ea5e2aa262493557abecb6587d7db69bde737 +LibTracyClient.v0.9.1+1.armv6l-linux-musleabihf.tar.gz/md5/44dddf9ef55cd9d222a16eff2b2e14e7 +LibTracyClient.v0.9.1+1.armv6l-linux-musleabihf.tar.gz/sha512/ba887e97366e9ac9dbc43864b3d8cd8cdf2db571fb198593f2ae66790fb9cd5a12d4c29088a65bc103939ec029fa426925a0990c0a2b1441fece974b3dabce6c +LibTracyClient.v0.9.1+1.armv7l-linux-gnueabihf.tar.gz/md5/286bbb5c258fcd38224ff03a691cf474 +LibTracyClient.v0.9.1+1.armv7l-linux-gnueabihf.tar.gz/sha512/26e85ec00a794901d412bb54d1ea1cbd9c7972e2dcc6fbcad12d520088c4d6d86a8ff7398cff14e60741abb30028fcda39515b2a1ae1a225a3192e9e956a8641 +LibTracyClient.v0.9.1+1.armv7l-linux-musleabihf.tar.gz/md5/36b317542174c07c4e85fdffcf5e34c7 +LibTracyClient.v0.9.1+1.armv7l-linux-musleabihf.tar.gz/sha512/d9e28e6ebb4537ae4de75ae03bb28d170c7dc92731f6a8d6431ce2e5ee5ad717a04990a42b57898247a5059b8e0d93b5812d6ffc66c94c5571adec2ecfe0555d +LibTracyClient.v0.9.1+1.i686-linux-gnu.tar.gz/md5/1990c1f0701a3c7853fa57e54d214465 +LibTracyClient.v0.9.1+1.i686-linux-gnu.tar.gz/sha512/f899f109ad77f2b1964e94242b0a601128b063140190105fd44e43782036ef0134cdc2b6cb1faaf5b7c0742f4a168c7b7870f18d2d18b19fc94d8f269f3027d1 +LibTracyClient.v0.9.1+1.i686-linux-musl.tar.gz/md5/eeee122d2166e8a251bcee40d66daede +LibTracyClient.v0.9.1+1.i686-linux-musl.tar.gz/sha512/fcf0de849b8533065e61d5e1528cc1e882d2734c488a030728ec4915651bb2bd049536d9d76ecce3063647d0cd20e10033beb3d8de82e06c9c73e9eb41b12b03 +LibTracyClient.v0.9.1+1.i686-w64-mingw32.tar.gz/md5/0f42ad75bb75084e129b0e6fe5e86196 +LibTracyClient.v0.9.1+1.i686-w64-mingw32.tar.gz/sha512/28821145c8d3d7d8dc3e1db883478e53b4eacafa5f4deae965057ad6466c683de6bd9265a92f1d63ab587107fbb374f7167ff4be70c5fbdf09db9b57fb619b3e +LibTracyClient.v0.9.1+1.powerpc64le-linux-gnu.tar.gz/md5/1a2243ac3a4efa224da1eaace7aa9278 +LibTracyClient.v0.9.1+1.powerpc64le-linux-gnu.tar.gz/sha512/acd17f12afb523348de56f7fa84497468ec5f2f76d9622180e72bded7eb4eda4033e28ed20cb25c7c8049b086c994c10a6d97efae06a661ce7d0d65e5c8bbfd5 +LibTracyClient.v0.9.1+1.x86_64-apple-darwin.tar.gz/md5/34a75343f9aed8e6db252cc043b26b09 +LibTracyClient.v0.9.1+1.x86_64-apple-darwin.tar.gz/sha512/2eaf4e5ef1959110cecaf467bdc28e92d45862c4d83315fccafbf3ab4e498918f4d715b0303be4e563ac7ddb88c9d9cec71351183c5ec0055759a86341473232 +LibTracyClient.v0.9.1+1.x86_64-linux-gnu.tar.gz/md5/d6fcd4255ab1363412e85497770ab522 +LibTracyClient.v0.9.1+1.x86_64-linux-gnu.tar.gz/sha512/a879afb13a35d18c8ed5593cf83d48ac228c45c246fa013f5f067fa216b61d2dc5b8adfc4ced6043c331b982d050522b228a5792c707b9deff2fb65b37aa66ea +LibTracyClient.v0.9.1+1.x86_64-linux-musl.tar.gz/md5/0490919c558c5ae8c833934426e0dda4 +LibTracyClient.v0.9.1+1.x86_64-linux-musl.tar.gz/sha512/b37637e8530ad9b3cb58732bb35d9634aadaeb5a83815f602d5804d4fff1499ea49ce72300b03036ecd642d0dbd8f86d475b637673bc8d400f70e4cb4cf865eb +LibTracyClient.v0.9.1+1.x86_64-unknown-freebsd.tar.gz/md5/985a19b60f44349870c304c7a140cad8 +LibTracyClient.v0.9.1+1.x86_64-unknown-freebsd.tar.gz/sha512/514f01127dcc641ab4b66fac49c0470f6277bff37fbd82084482d3db72e1964d85655ca8aa135dbe08265d711d857ed861eba038a072f8e4fcffeefe3b11808d +LibTracyClient.v0.9.1+1.x86_64-w64-mingw32.tar.gz/md5/a777f0997a238c3f28a362e2998d92a2 +LibTracyClient.v0.9.1+1.x86_64-w64-mingw32.tar.gz/sha512/3aa49b49f696792d20265e9947b9a0dc4b888a482617513252176b0ac59db9069c965f01a8c1c253f73050eab3344d9d0b4c26a826ff9bfa92e36eb42814444d diff --git a/deps/libtracyclient.mk b/deps/libtracyclient.mk index bf84e25ccefc9..19c08f774aa0c 100644 --- a/deps/libtracyclient.mk +++ b/deps/libtracyclient.mk @@ -10,7 +10,6 @@ LIBTRACYCLIENT_SRCCACHE := $(SRCCACHE)/$(LIBTRACYCLIENT_SRC_DIR) LIBTRACYCLIENT_CMAKE := LIBTRACYCLIENT_CMAKE += -DBUILD_SHARED_LIBS=ON LIBTRACYCLIENT_CMAKE += -DTRACY_FIBERS=ON -LIBTRACYCLIENT_CMAKE += -DTRACY_NO_BROADCAST=ON LIBTRACYCLIENT_CMAKE += -DTRACY_NO_SAMPLING=ON LIBTRACYCLIENT_CMAKE += -DTRACY_ONLY_LOCALHOST=ON LIBTRACYCLIENT_CMAKE += -DTRACY_NO_CODE_TRANSFER=ON diff --git a/deps/libtracyclient.version b/deps/libtracyclient.version index 3c0e0ebd56fa3..2faa4327b7749 100644 --- a/deps/libtracyclient.version +++ b/deps/libtracyclient.version @@ -1,6 +1,6 @@ ## jll artifact LIBTRACYCLIENT_JLL_NAME := LibTracyClient -LIBTRACYCLIENT_JLL_VER := 0.9.1+0 +LIBTRACYCLIENT_JLL_VER := 0.9.1+1 ## source build LIBTRACYCLIENT_VER := 0.9.1 From 8e0cba566348e74a9451d0af23b0488afc678597 Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Thu, 13 Apr 2023 13:54:23 -0400 Subject: [PATCH 11/12] timing: add envvars for subsystem enable and verbose metadata This commit adds the following environment variables: - `JULIA_TIMING_SUBSYSTEMS` which can be set to, e.g., "+INFERENCE,-GC,METHOD_MATCH" to enable INFERENCE and METHOD_MATCH and disable GC. - `JULIA_TIMING_METADATA_PRINT_LIMIT` which defaults to 10 and determines how many metadata items to add to a single timing zone before truncating This commit also includes other miscellaneous changes to incorporate review feedback. --- src/jitlayers.cpp | 4 +-- src/jlapi.c | 7 ++-- src/task.c | 2 -- src/timing.c | 81 +++++++++++++++++++++++++++++++++++++++++++---- src/timing.h | 15 +++++++++ 5 files changed, 96 insertions(+), 13 deletions(-) diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 75f2b244dacd3..2ca97289f62d9 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -251,7 +251,7 @@ static jl_callptr_t _jl_compile_codeinst( size_t i = 0; for (auto &def : emitted) { jl_code_instance_t *this_code = def.first; - if (i < 10) + if (i < jl_timing_print_limit) jl_timing_show_func_sig(this_code->def->specTypes, JL_TIMING_CURRENT_BLOCK); jl_llvm_functions_t decls = std::get<1>(def.second); @@ -297,7 +297,7 @@ static jl_callptr_t _jl_compile_codeinst( fptr = addr; i++; } - if (i > 10) + if (i > jl_timing_print_limit) jl_timing_printf(JL_TIMING_CURRENT_BLOCK, "... <%d methods truncated>", i - 10); uint64_t end_time = 0; diff --git a/src/jlapi.c b/src/jlapi.c index 8f5e3e6cb13dc..369f4d6ec3ff1 100644 --- a/src/jlapi.c +++ b/src/jlapi.c @@ -685,8 +685,11 @@ static void rr_detach_teleport(void) { JL_DLLEXPORT int jl_repl_entrypoint(int argc, char *argv[]) { #ifdef USE_TRACY - if (getenv("WAIT_FOR_TRACY")) - while (!TracyCIsConnected) ; // Wait for connection + // Apply e.g. JULIA_TIMING_SUBSYSTEMS="+GC,-INFERENCE" and + // JULIA_TIMING_METADATA_PRINT_LIMIT=20 + jl_timing_apply_env(); + if (getenv("JULIA_WAIT_FOR_TRACY")) + while (!TracyCIsConnected) jl_cpu_pause(); // Wait for connection #endif // no-op on Windows, note that the caller must have already converted diff --git a/src/task.c b/src/task.c index e73b19563e336..123cfaac00163 100644 --- a/src/task.c +++ b/src/task.c @@ -1222,8 +1222,6 @@ CFI_NORETURN _start_task(); } -const char* fiber = "task"; - STATIC_OR_JS void NOINLINE JL_NORETURN _start_task(void) { CFI_NORETURN diff --git a/src/timing.c b/src/timing.c index c3ba3809884bf..7337902010a3d 100644 --- a/src/timing.c +++ b/src/timing.c @@ -36,6 +36,11 @@ JL_DLLEXPORT uint64_t jl_timing_enable_mask = 0xFFFFFFFFFFFFFFFF; #endif JL_DLLEXPORT uint64_t jl_timing_counts[(int)JL_TIMING_LAST] = {0}; + +// Used to as an item limit when several strings of metadata can +// potentially be associated with a single timing zone. +JL_DLLEXPORT uint32_t jl_timing_print_limit = 10; + const char *jl_timing_names[(int)JL_TIMING_LAST] = { #define X(name) #name @@ -100,14 +105,16 @@ void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t jl_timing_block_t *jl_timing_block_exit_task(jl_task_t *ct, jl_ptls_t ptls) { #ifdef USE_TRACY - // Tracy is fairly strict about not leaving a fiber that - // hasn't been entered, which happens often when - // connecting to a running Julia session. + // Tracy is fairly strict about not leaving a fiber that hasn't + // been entered, which happens often when connecting to a running + // Julia session. + // + // Eventually, Tracy will support telling the server which fibers + // are active upon connection, but until then we work around the + // problem by not explicitly leaving the fiber at all. // - // Eventually, Tracy will support telling the server that - // which fibers are active upon connection, but until then - // work around around the problem by just entering the new - // fiber directly, which implicitly leaves any active fibers. + // Later when we enter the new fiber directly, that will cause the + // the active fiber to be left implicitly. //TracyCFiberLeave; #endif @@ -220,11 +227,71 @@ JL_DLLEXPORT int jl_timing_set_enable(const char *subsystem, uint8_t enabled) return -1; } +static void jl_timing_set_enable_from_env(void) +{ + const char *env = getenv("JULIA_TIMING_SUBSYSTEMS"); + if (!env) + return; + + // Copy `env`, so that we can modify it + size_t sz = strlen(env) + 1; + char *env_copy = (char *)malloc(sz); + memcpy(env_copy, env, sz); + + char *subsystem = env_copy; + char *ch = subsystem; + uint8_t enable = 1; + while (1) { + // +SUBSYSTEM means enable, -SUBSYSTEM means disable + if (*subsystem == '+' || *subsystem == '-') + enable = (*subsystem++ == '+'); + + if (*ch == ',') { + *ch++ = '\0'; + if ((*subsystem != '\0') && jl_timing_set_enable(subsystem, enable)) + fprintf(stderr, "warning: unable to configure timing for non-existent subsystem \"%s\"\n", subsystem); + + subsystem = ch; + enable = 1; + } + else if (*ch == '\0') { + if ((*subsystem != '\0') && jl_timing_set_enable(subsystem, enable)) + fprintf(stderr, "warning: unable to configure timing for non-existent subsystem \"%s\"\n", subsystem); + + break; + } + else ch++; + } + free(env_copy); +} + +static void jl_timing_set_print_limit_from_env(void) +{ + const char *const env = getenv("JULIA_TIMING_METADATA_PRINT_LIMIT"); + if (!env) + return; + + char *endp; + long value = strtol(env, &endp, 10); + if (*endp == '\0' && value >= 0 && value <= UINT32_MAX) + jl_timing_print_limit = (uint32_t)value; +} + +void jl_timing_apply_env(void) +{ + // JULIA_TIMING_SUBSYSTEMS + jl_timing_set_enable_from_env(); + + // JULIA_TIMING_METADATA_PRINT_LIMIT + jl_timing_set_print_limit_from_env(); +} + #else void jl_init_timing(void) { } void jl_destroy_timing(void) { } JL_DLLEXPORT int jl_timing_set_enable(const char *subsystem, uint8_t enabled) { return -1; } +JL_DLLEXPORT uint32_t jl_timing_print_limit = 0; #endif diff --git a/src/timing.h b/src/timing.h index d7dc53ad0e8be..d6e4c6d80ab63 100644 --- a/src/timing.h +++ b/src/timing.h @@ -16,6 +16,21 @@ void jl_destroy_timing(void) JL_NOTSAFEPOINT; // Returns -1 if no matching sub-system was found. int jl_timing_set_enable(const char *subsystem, uint8_t enabled); +// Check for environment vars "JULIA_TIMING_METADATA_PRINT_LIMIT" and +// "JULIA_TIMING_SUBSYSTEMS" and if present apply these to the metadata +// print limit and the timings enable mask, respectively. +// +// For example, to enable INFERENCE and METHOD_MATCH and disable GC: +// JULIA_TIMING_SUBSYSTEMS="+INFERENCE,-GC,+METHOD_MATCH" +// +// For example, to increase the metadata item print limit from 10 to 20: +// JULIA_TIMING_METADATA_PRINT_LIMIT=20 +void jl_timing_apply_env(void); + +// Configurable item limit, runtime code should use this to limit printing +// when adding potentially many items of metadata to a single timing zone. +extern uint32_t jl_timing_print_limit; + #ifdef __cplusplus } #endif From 6ac57e6b8b392a574ef4e9f996c2d6924d238a95 Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Mon, 17 Apr 2023 12:10:52 -0400 Subject: [PATCH 12/12] profiling: delete `JULIA_WAIT_FOR_TRACY` when pre-compiling We'll eventually want to provide some kind of useful interface to turn this on for a particular pre-compilation, but for now we should just prevent causing pre-compilation to hang forever. --- base/loading.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/base/loading.jl b/base/loading.jl index b82028216663b..df5dde36d371c 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -2109,6 +2109,7 @@ function create_expr_cache(pkg::PkgId, input::String, output::String, output_o:: $trace -`, "OPENBLAS_NUM_THREADS" => 1, + "JULIA_WAIT_FOR_TRACY" => nothing, "JULIA_NUM_THREADS" => 1), stderr = internal_stderr, stdout = internal_stdout), "w", stdout)