From 6c365415864269b328b0f4180d4effabe98bada3 Mon Sep 17 00:00:00 2001
From: Nathan Daly
Date: Mon, 20 Dec 2021 23:13:24 -0500
Subject: [PATCH 1/4] Remove file println debugging

---
 stdlib/AllocProfile/src/AllocProfile.jl | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/stdlib/AllocProfile/src/AllocProfile.jl b/stdlib/AllocProfile/src/AllocProfile.jl
index 3a5e74eca8f8f..7ec6c3d700a89 100644
--- a/stdlib/AllocProfile/src/AllocProfile.jl
+++ b/stdlib/AllocProfile/src/AllocProfile.jl
@@ -100,36 +100,29 @@ end
 
 function decode(raw_results::RawAllocResults)::AllocResults
     cache = BacktraceCache()
 
+    @info "ALLOCS"
     allocs = [
         decode_alloc(cache, unsafe_load(raw_results.allocs, i))
         for i in 1:raw_results.num_allocs
     ]
 
+    @info "FREES"
     frees = Dict{Type,UInt}()
     for i in 1:raw_results.num_frees
         free = unsafe_load(raw_results.frees, i)
         type = load_type(free.type)
         frees[type] = free.count
     end
-    
+
     return AllocResults(
         allocs,
         frees
     )
 end
 
-const f = Ref{IOStream}()
-
-function __init__()
-    f[] = open("debug.log", "w")
-end
-
 function load_backtrace(trace::RawBacktrace)::Vector{Ptr{Cvoid}}
-    println(f[], "load_backtrace: trace.data: $(trace.data)")
-    println(f[], "load_backtrace: trace.size: $(trace.size)")
     out = Vector{Ptr{Cvoid}}()
     for i in 1:trace.size
-        println(f[], "  $i")
         push!(out, unsafe_load(trace.data, i))
     end

From 8da838489db2f6226fc11477f77730aba831dbc8 Mon Sep 17 00:00:00 2001
From: Nathan Daly
Date: Mon, 20 Dec 2021 23:15:57 -0500
Subject: [PATCH 2/4] Add precompile statements to AllocProfile package
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This _drastically_ speeds up the tests, for reasons I don't exactly
understand... I wonder if it was messing up some heuristics and deciding
to interpret the code instead of compiling it, and had some weird corner
cases in the interpreted code or something? I dunno! But anyway, this
drastically speeds it up, so 🤷 sounds like not our problem 😊
---
 stdlib/AllocProfile/src/AllocProfile.jl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/stdlib/AllocProfile/src/AllocProfile.jl b/stdlib/AllocProfile/src/AllocProfile.jl
index 7ec6c3d700a89..cfd0b45dcdf96 100644
--- a/stdlib/AllocProfile/src/AllocProfile.jl
+++ b/stdlib/AllocProfile/src/AllocProfile.jl
@@ -151,4 +151,12 @@ function stacktrace_memoized(
     return stack
 end
 
+precompile(start, ())
+precompile(stop, ())
+
+function __init__()
+    precompile(start, ())
+    precompile(stop, ())
+end
+
 end

From e314827551c95feff0af84d304378e7f3689a0a3 Mon Sep 17 00:00:00 2001
From: Nathan Daly
Date: Mon, 20 Dec 2021 23:21:11 -0500
Subject: [PATCH 3/4] add comment about precompilation

---
 stdlib/AllocProfile/src/AllocProfile.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/stdlib/AllocProfile/src/AllocProfile.jl b/stdlib/AllocProfile/src/AllocProfile.jl
index cfd0b45dcdf96..da31d99ee28d6 100644
--- a/stdlib/AllocProfile/src/AllocProfile.jl
+++ b/stdlib/AllocProfile/src/AllocProfile.jl
@@ -151,10 +151,16 @@ function stacktrace_memoized(
     return stack
 end
 
+# Precompile once for the package cache,
 precompile(start, ())
 precompile(stop, ())
 
 function __init__()
+    # And once when loading the package, to get the full machine code precompiled.
+    # TODO: Although actually, we probably don't need this since this package will be
+    # precompiled into the sysimg, so the top-level statements will be enough to get the
+    # machine code codegen precompiled as well. :)
+    # We can delete this function once we make this package a stdlib.
     precompile(start, ())
     precompile(stop, ())
 end

From cae4480288effcf504eb243173aded97fde9b24a Mon Sep 17 00:00:00 2001
From: Nathan Daly
Date: Tue, 21 Dec 2021 10:33:24 -0500
Subject: [PATCH 4/4] Malloc right-sized buffers for backtraces.

Instead of allocating a maximum-sized buffer for each backtrace, we keep
a single max-sized buffer as a scratch space, write the backtrace to it,
and then, once we know the size, allocate a right-sized buffer for the
backtrace and copy it over.

Benchmark results (measured time for profiling allocations on an internal
Arroyo benchmark, with `skip_every=0`):

This only slightly improves the time to record an allocations profile:
- Before: 275.082525 seconds
- After: 245.891006 seconds

But it drastically improves the memory usage once the profiling is
completed, according to System Activity Monitor:
- Before: 17.35 GB
- After: 6.92 GB
- (Compared to 350 MB for the same task without profiling)

We could probably improve the time overhead still further by using a
single big vector instead of a bunch of individually allocated buffers,
but this is probably about the best we could do in terms of space usage.
That change would let us eliminate the redundant copying and would also
amortize away the buffer allocations, both of which should reduce the
performance impact. But I'm guessing the time is mostly dominated by just
how long the stack traces are, and there's no getting around that. At
best, we could expect maybe a 2x-3x improvement from those changes, I
think.
---
 src/gc-alloc-profiler.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gc-alloc-profiler.cpp b/src/gc-alloc-profiler.cpp
index 1497bc7ff3ed5..8d60124adcf9f 100644
--- a/src/gc-alloc-profiler.cpp
+++ b/src/gc-alloc-profiler.cpp
@@ -53,10 +53,14 @@
 CombinedResults g_combined_results; // will live forever
 
 // === stack stuff ===
 RawBacktrace get_raw_backtrace() {
-    jl_bt_element_t *bt_data = (jl_bt_element_t*) malloc(JL_MAX_BT_SIZE);
+    static jl_bt_element_t static_bt_data[JL_MAX_BT_SIZE];
     // TODO: tune the number of frames that are skipped
-    size_t bt_size = rec_backtrace(bt_data, JL_MAX_BT_SIZE, 1);
+    size_t bt_size = rec_backtrace(static_bt_data, JL_MAX_BT_SIZE, 1);
+
+    size_t bt_bytes = bt_size * sizeof(jl_bt_element_t);
+    jl_bt_element_t *bt_data = (jl_bt_element_t*) malloc(bt_bytes);
+    memcpy(bt_data, static_bt_data, bt_bytes);
 
     return RawBacktrace{
         bt_data,
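
A note on the "single big vector" idea floated in the last commit message:
below is a minimal sketch of what that could look like. It is not part of
the patch series. The names BacktraceArena and ArenaBacktrace are invented
for illustration, and jl_bt_element_t, JL_MAX_BT_SIZE, and rec_backtrace
are stubbed (with assumed shapes, not the real runtime declarations) so the
sketch is self-contained. Each backtrace is recorded directly into the tail
of one shared buffer and referenced by (offset, size), which removes both
the per-backtrace malloc and the copy out of the scratch buffer.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Stubs standing in for the Julia runtime's definitions; these are
    // assumptions for the sketch, not the declarations in julia.h.
    typedef uintptr_t jl_bt_element_t;
    static const size_t JL_MAX_BT_SIZE = 1024;
    static size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip) {
        (void)bt_data; (void)maxsize; (void)skip;
        return 0; // the real function records frames into bt_data and returns the count
    }

    // A recorded backtrace is just a view into the shared arena. We store
    // (offset, size) rather than a pointer, because growing the vector can
    // reallocate and move the frames.
    struct ArenaBacktrace {
        size_t offset; // index of the first frame in the arena
        size_t size;   // number of frames
    };

    struct BacktraceArena {
        std::vector<jl_bt_element_t> frames; // one big buffer for every backtrace

        ArenaBacktrace record() {
            // Grow by the worst case, record directly into the tail, then
            // give back the unused portion -- no scratch buffer, no memcpy.
            size_t offset = frames.size();
            frames.resize(offset + JL_MAX_BT_SIZE);
            size_t bt_size = rec_backtrace(frames.data() + offset, JL_MAX_BT_SIZE, 1);
            frames.resize(offset + bt_size); // shrinking never reallocates
            return ArenaBacktrace{offset, bt_size};
        }

        // Resolve a view into a pointer; only valid until the next record() call.
        const jl_bt_element_t *frames_of(const ArenaBacktrace &bt) const {
            return frames.data() + bt.offset;
        }
    };

One trade-off worth noting: when the vector's capacity is exhausted it
reallocates and copies everything recorded so far, so the copying is only
amortized rather than eliminated, and resize() value-initializes the
reserved tail; a reserve()-plus-raw-write scheme could avoid that zeroing
at some cost in clarity.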