From 6c365415864269b328b0f4180d4effabe98bada3 Mon Sep 17 00:00:00 2001
From: Nathan Daly
Date: Mon, 20 Dec 2021 23:13:24 -0500
Subject: [PATCH 1/4] Remove file println debugging

---
 stdlib/AllocProfile/src/AllocProfile.jl | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/stdlib/AllocProfile/src/AllocProfile.jl b/stdlib/AllocProfile/src/AllocProfile.jl
index 3a5e74eca8f8f..7ec6c3d700a89 100644
--- a/stdlib/AllocProfile/src/AllocProfile.jl
+++ b/stdlib/AllocProfile/src/AllocProfile.jl
@@ -100,36 +100,29 @@ end
 
 function decode(raw_results::RawAllocResults)::AllocResults
     cache = BacktraceCache()
 
+    @info "ALLOCS"
     allocs = [
         decode_alloc(cache, unsafe_load(raw_results.allocs, i))
         for i in 1:raw_results.num_allocs
     ]
 
+    @info "FREES"
     frees = Dict{Type,UInt}()
     for i in 1:raw_results.num_frees
         free = unsafe_load(raw_results.frees, i)
         type = load_type(free.type)
         frees[type] = free.count
     end
-    
+
     return AllocResults(
         allocs,
         frees
     )
 end
 
-const f = Ref{IOStream}()
-
-function __init__()
-    f[] = open("debug.log", "w")
-end
-
 function load_backtrace(trace::RawBacktrace)::Vector{Ptr{Cvoid}}
-    println(f[], "load_backtrace: trace.data: $(trace.data)")
-    println(f[], "load_backtrace: trace.size: $(trace.size)")
     out = Vector{Ptr{Cvoid}}()
     for i in 1:trace.size
-        println(f[], "  $i")
         push!(out, unsafe_load(trace.data, i))
     end

From 8da838489db2f6226fc11477f77730aba831dbc8 Mon Sep 17 00:00:00 2001
From: Nathan Daly
Date: Mon, 20 Dec 2021 23:15:57 -0500
Subject: [PATCH 2/4] Add precompile statements to AllocProfile package
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This _drastically_ speeds up the tests, for reasons I don't exactly
understand... I wonder if it was messing up some heuristics and deciding
to interpret the code instead of compiling it, and had some weird corner
cases in the interpreted code or something? I dunno! But anyway, this
drastically speeds it up, so 🤷 sounds like not our problem 😊
---
 stdlib/AllocProfile/src/AllocProfile.jl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/stdlib/AllocProfile/src/AllocProfile.jl b/stdlib/AllocProfile/src/AllocProfile.jl
index 7ec6c3d700a89..cfd0b45dcdf96 100644
--- a/stdlib/AllocProfile/src/AllocProfile.jl
+++ b/stdlib/AllocProfile/src/AllocProfile.jl
@@ -151,4 +151,12 @@ function stacktrace_memoized(
     return stack
 end
 
+precompile(start, ())
+precompile(stop, ())
+
+function __init__()
+    precompile(start, ())
+    precompile(stop, ())
+end
+
 end

From e314827551c95feff0af84d304378e7f3689a0a3 Mon Sep 17 00:00:00 2001
From: Nathan Daly
Date: Mon, 20 Dec 2021 23:21:11 -0500
Subject: [PATCH 3/4] add comment about precompilation

---
 stdlib/AllocProfile/src/AllocProfile.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/stdlib/AllocProfile/src/AllocProfile.jl b/stdlib/AllocProfile/src/AllocProfile.jl
index cfd0b45dcdf96..da31d99ee28d6 100644
--- a/stdlib/AllocProfile/src/AllocProfile.jl
+++ b/stdlib/AllocProfile/src/AllocProfile.jl
@@ -151,10 +151,16 @@ function stacktrace_memoized(
     return stack
 end
 
+# Precompile once for the package cache,
 precompile(start, ())
 precompile(stop, ())
 
 function __init__()
+    # And once when loading the package, to get the full machine code precompiled.
+    # TODO: Although actually, we probably don't need this since this package will be
+    # precompiled into the sysimg, so the top-level statements will be enough to get the
+    # machine code codegen precompiled as well. :)
+    # We can delete this function once we make this package a stdlib.
     precompile(start, ())
     precompile(stop, ())
 end

From cae4480288effcf504eb243173aded97fde9b24a Mon Sep 17 00:00:00 2001
From: Nathan Daly
Date: Tue, 21 Dec 2021 10:33:24 -0500
Subject: [PATCH 4/4] Malloc right-sized buffers for backtraces.

Instead of allocating a maximum-sized buffer for each backtrace, we keep
a single max-sized buffer as a scratch space, write the backtrace to it,
and then, once we know the size, allocate a right-sized buffer for the
backtrace and copy it over.

Benchmark results (measured time for profiling allocations on an internal
Arroyo benchmark, with `skip_every=0`):

This only slightly improves the time to record an allocations profile:
- Before: 275.082525 seconds
- After: 245.891006 seconds

But it drastically improves the memory usage once the profiling is
completed, according to System Activity Monitor:
- Before: 17.35 GB
- After: 6.92 GB
- (Compared to 350 MB for the same task without profiling)

We could probably improve the time overhead still further by using a
single big vector instead of a bunch of individually allocated buffers,
but this is probably about the best we could do in terms of space usage.
That change would let us eliminate the redundant copying and would also
amortize away the buffer allocations, both of which should reduce the
performance impact. But I'm guessing the time is mostly dominated by just
how long the stack traces are, and there's no getting around that. At
best, we could expect maybe a 2x-3x improvement from those changes, I
think.
---
 src/gc-alloc-profiler.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gc-alloc-profiler.cpp b/src/gc-alloc-profiler.cpp
index 1497bc7ff3ed5..8d60124adcf9f 100644
--- a/src/gc-alloc-profiler.cpp
+++ b/src/gc-alloc-profiler.cpp
@@ -53,10 +53,14 @@
 CombinedResults g_combined_results; // will live forever
 
 // === stack stuff ===
 RawBacktrace get_raw_backtrace() {
-    jl_bt_element_t *bt_data = (jl_bt_element_t*) malloc(JL_MAX_BT_SIZE);
+    static jl_bt_element_t static_bt_data[JL_MAX_BT_SIZE];
     // TODO: tune the number of frames that are skipped
-    size_t bt_size = rec_backtrace(bt_data, JL_MAX_BT_SIZE, 1);
+    size_t bt_size = rec_backtrace(static_bt_data, JL_MAX_BT_SIZE, 1);
+
+    size_t bt_bytes = bt_size * sizeof(jl_bt_element_t);
+    jl_bt_element_t *bt_data = (jl_bt_element_t*) malloc(bt_bytes);
+    memcpy(bt_data, static_bt_data, bt_bytes);
 
     return RawBacktrace{
         bt_data,
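
A note on the "single big vector" idea floated in the last commit message:
below is a minimal sketch of what that could look like. It is not part of
the patch series. The names BacktraceArena and ArenaBacktrace are invented
for illustration, and jl_bt_element_t, JL_MAX_BT_SIZE, and rec_backtrace
are stubbed (with assumed shapes, not the real runtime declarations) so the
sketch is self-contained. Each backtrace is recorded directly into the tail
of one shared buffer and referenced by (offset, size), which removes both
the per-backtrace malloc and the copy out of the scratch buffer.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Stubs standing in for the Julia runtime's definitions; these are
    // assumptions for the sketch, not the declarations in julia.h.
    typedef uintptr_t jl_bt_element_t;
    static const size_t JL_MAX_BT_SIZE = 1024;
    static size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip) {
        (void)bt_data; (void)maxsize; (void)skip;
        return 0; // the real function records frames into bt_data and returns the count
    }

    // A recorded backtrace is just a view into the shared arena. We store
    // (offset, size) rather than a pointer, because growing the vector can
    // reallocate and move the frames.
    struct ArenaBacktrace {
        size_t offset; // index of the first frame in the arena
        size_t size;   // number of frames
    };

    struct BacktraceArena {
        std::vector<jl_bt_element_t> frames; // one big buffer for every backtrace

        ArenaBacktrace record() {
            // Grow by the worst case, record directly into the tail, then
            // give back the unused portion -- no scratch buffer, no memcpy.
            size_t offset = frames.size();
            frames.resize(offset + JL_MAX_BT_SIZE);
            size_t bt_size = rec_backtrace(frames.data() + offset, JL_MAX_BT_SIZE, 1);
            frames.resize(offset + bt_size); // shrinking never reallocates
            return ArenaBacktrace{offset, bt_size};
        }

        // Resolve a view into a pointer; only valid until the next record() call.
        const jl_bt_element_t *frames_of(const ArenaBacktrace &bt) const {
            return frames.data() + bt.offset;
        }
    };

One trade-off worth noting: when the vector's capacity is exhausted it
reallocates and copies everything recorded so far, so the copying is only
amortized rather than eliminated, and resize() value-initializes the
reserved tail; a reserve()-plus-raw-write scheme could avoid that zeroing
at some cost in clarity.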