Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[scratch] Scratch PR for backporting the allocations profiler to julia 1.7 (we're targeting this branch in our build at RelationalAI) #44119

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ RUNTIME_SRCS := \
jltypes gf typemap smallintset ast builtins module interpreter symbol \
dlload sys init task array dump staticdata toplevel jl_uv datatype \
simplevector runtime_intrinsics precompile \
threading partr stackwalk gc gc-debug gc-pages gc-stacks method \
threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler method \
jlapi signal-handling safepoint timing subtype \
crc32c APInt-C processor ircode opaque_closure
SRCS := jloptions runtime_ccall rtutils
Expand Down
148 changes: 148 additions & 0 deletions src/gc-alloc-profiler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#include "gc-alloc-profiler.h"

#include "julia_internal.h"
#include "gc.h"

#include <string>
#include <vector>

using std::string;
using std::vector;

// A raw (unsymbolicated) backtrace: an owned, exactly-sized buffer of frames.
struct jl_raw_backtrace_t {
    jl_bt_element_t *data; // owned buffer; released in jl_free_alloc_profile()
    size_t size;           // number of jl_bt_element_t entries in `data`
};

// One recorded (sampled) allocation event.
struct jl_raw_alloc_t {
    jl_datatype_t *type_address;  // type of the allocated value, or a sentinel tag when unknown
    jl_raw_backtrace_t backtrace; // stack at the allocation site (owns its buffer)
    size_t size;                  // requested allocation size in bytes
};

// == These structs define the global singleton profile buffer that will be used by
// callbacks to store profile results. ==

// Allocations recorded by a single thread. Each thread writes only to its own
// entry, so no locking is used on the record path.
struct jl_per_thread_alloc_profile_t {
    vector<jl_raw_alloc_t> allocs;
};

// The whole profile: sampling configuration plus one buffer per thread.
struct jl_alloc_profile_t {
    double sample_rate; // probability in [0, 1] that any given allocation is recorded

    vector<jl_per_thread_alloc_profile_t> per_thread_profiles; // indexed by thread id
};

// Allocations from all threads, merged together at fetch time.
struct jl_combined_results {
    vector<jl_raw_alloc_t> combined_allocs;
};

// == Global variables manipulated by callbacks ==

jl_alloc_profile_t g_alloc_profile;
// Fast-path flag read by the inline wrapper in gc-alloc-profiler.h on every allocation.
int g_alloc_profile_enabled = false;
jl_combined_results g_combined_results; // Will live forever.

// === stack stuff ===

// Capture the current task's backtrace into a freshly allocated, exactly-sized
// buffer owned by the returned jl_raw_backtrace_t.
jl_raw_backtrace_t get_raw_backtrace() JL_NOTSAFEPOINT {
    // Record into a reusable MAX-sized scratch buffer first, so the final
    // allocation can be sized exactly. The scratch buffer is per-thread,
    // which keeps this thread-safe without locking.
    jl_ptls_t ptls = jl_current_task->ptls;
    jl_bt_element_t *scratch = ptls->profiling_bt_buffer;
    if (scratch == NULL) {
        // Lazily allocate the scratch buffer the first time this thread records.
        scratch = (jl_bt_element_t*) malloc_s(sizeof(jl_bt_element_t) * (JL_MAX_BT_SIZE + 1));
        ptls->profiling_bt_buffer = scratch;
    }

    size_t num_frames = rec_backtrace(scratch, JL_MAX_BT_SIZE, 2);

    // Copy only the frames actually recorded into a right-sized buffer that
    // the profile will own.
    size_t num_bytes = num_frames * sizeof(jl_bt_element_t);
    jl_bt_element_t *frames = (jl_bt_element_t*) malloc_s(num_bytes);
    memcpy(frames, scratch, num_bytes);

    return jl_raw_backtrace_t{frames, num_frames};
}

// == exported interface ==

extern "C" { // C linkage: keep these names unmangled so they can be called via ccall.

// Begin (or resume) allocation profiling, sampling each allocation with
// probability `sample_rate`.
JL_DLLEXPORT void jl_start_alloc_profile(double sample_rate) {
    // Make sure there is one profile buffer per thread. This only does work
    // the first time it runs; resize() value-initializes the new entries.
    // We never shrink, so buffers from a previous session are kept until
    // jl_free_alloc_profile(). (Assumes jl_n_threads does not grow after
    // startup on this branch — TODO confirm.)
    if (g_alloc_profile.per_thread_profiles.size() < (size_t)jl_n_threads) {
        g_alloc_profile.per_thread_profiles.resize(jl_n_threads);
    }

    g_alloc_profile.sample_rate = sample_rate;
    g_alloc_profile_enabled = true;
}

// Drain every per-thread buffer into the long-lived combined buffer and hand
// back a raw view of it. The returned pointers stay valid until
// jl_free_alloc_profile() is called.
JL_DLLEXPORT jl_profile_allocs_raw_results_t jl_fetch_alloc_profile() {
    // combine allocs
    // TODO: interleave to preserve ordering
    for (auto& thread_profile : g_alloc_profile.per_thread_profiles) {
        auto& src = thread_profile.allocs;
        g_combined_results.combined_allocs.insert(
            g_combined_results.combined_allocs.end(), src.begin(), src.end());
        src.clear();
    }

    return jl_profile_allocs_raw_results_t{
        g_combined_results.combined_allocs.data(),
        g_combined_results.combined_allocs.size(),
    };
}

// Stop sampling new allocations. Already-recorded results stay buffered until
// they are fetched (jl_fetch_alloc_profile) or freed (jl_free_alloc_profile).
JL_DLLEXPORT void jl_stop_alloc_profile() {
    g_alloc_profile_enabled = false;
}

// Release all profile results: the backtrace buffers owned by each recorded
// alloc, both in the per-thread buffers and in the combined buffer.
JL_DLLEXPORT void jl_free_alloc_profile() {
    // Free any allocs that remain in the per-thread profiles, that haven't
    // been combined yet (which happens in jl_fetch_alloc_profile()).
    for (auto& profile : g_alloc_profile.per_thread_profiles) {
        // Iterate by reference: iterating by value would copy each
        // jl_raw_alloc_t (including its embedded backtrace struct) per element.
        for (auto& alloc : profile.allocs) {
            free(alloc.backtrace.data);
        }
        profile.allocs.clear();
    }

    // Free the allocs that have been already combined into the combined results object.
    for (auto& alloc : g_combined_results.combined_allocs) {
        free(alloc.backtrace.data);
    }

    g_combined_results.combined_allocs.clear();
}

// == callback called into by the outside ==

// Record a sampled allocation into the current thread's profile buffer.
// Reached (via the inline wrapper in gc-alloc-profiler.h) only while
// profiling is enabled.
void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *type) JL_NOTSAFEPOINT {
    // Decide whether to sample *before* touching any per-thread profile
    // state, so the common not-sampled path does as little work as possible.
    // NOTE(review): rand() is not guaranteed to be thread-safe/reentrant;
    // consider a per-thread PRNG — confirm this is acceptable for the backport.
    auto sample_val = double(rand()) / double(RAND_MAX);
    if (sample_val > g_alloc_profile.sample_rate) {
        return;
    }

    auto thread_id = jl_atomic_load_relaxed(&jl_current_task->tid);
    auto& profile = g_alloc_profile.per_thread_profiles[thread_id];

    profile.allocs.emplace_back(jl_raw_alloc_t{
        type,
        get_raw_backtrace(),
        size
    });
}

} // extern "C"
51 changes: 51 additions & 0 deletions src/gc-alloc-profiler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#ifndef JL_GC_ALLOC_PROFILER_H
#define JL_GC_ALLOC_PROFILER_H

#include "julia.h"
#include "ios.h"

#ifdef __cplusplus
extern "C" {
#endif

// ---------------------------------------------------------------------
// The public interface to call from Julia for allocations profiling
// ---------------------------------------------------------------------

// Forward-declaration to avoid a dependency in this header file.
struct jl_raw_alloc_t; // Defined in gc-alloc-profiler.cpp

typedef struct {
struct jl_raw_alloc_t *allocs;
size_t num_allocs;
} jl_profile_allocs_raw_results_t;

JL_DLLEXPORT void jl_start_alloc_profile(double sample_rate);
JL_DLLEXPORT jl_profile_allocs_raw_results_t jl_fetch_alloc_profile(void);
JL_DLLEXPORT void jl_stop_alloc_profile(void);
JL_DLLEXPORT void jl_free_alloc_profile(void);

// ---------------------------------------------------------------------
// Functions to call from GC when alloc profiling is enabled
// ---------------------------------------------------------------------

void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *typ) JL_NOTSAFEPOINT;

extern int g_alloc_profile_enabled;

#define jl_gc_unknown_type_tag ((jl_datatype_t*)0xdeadaa03)

// Fast-path hook invoked on every GC allocation: costs a single
// predicted-not-taken branch when profiling is off, and only calls into the
// profiler (_maybe_record_alloc_to_profile) when it is enabled.
static inline void maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *typ) JL_NOTSAFEPOINT {
    if (__unlikely(g_alloc_profile_enabled)) {
        _maybe_record_alloc_to_profile(val, size, typ);
    }
}

#ifdef __cplusplus
}
#endif


#endif // JL_GC_ALLOC_PROFILER_H
41 changes: 38 additions & 3 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,7 @@ static void sweep_weak_refs(void)
// big value list

// Size includes the tag and the tag is not cleared!!
JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz)
{
maybe_collect(ptls);
size_t offs = offsetof(bigval_t, header);
Expand Down Expand Up @@ -970,6 +970,22 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
return jl_valueof(&v->header);
}

// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code.
JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
{
    jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz);

    // No type information is available here, so record the allocation under
    // the "unknown type" sentinel tag.
    maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag);
    return val;
}

// This wrapper exists only to prevent `jl_gc_big_alloc_inner` from being inlined into
// its callers. We provide an external-facing interface for callers, and inline `jl_gc_big_alloc_inner`
// into this. (See https://github.com/JuliaLang/julia/pull/43868 for more details.)
// Unlike jl_gc_big_alloc, this does NOT record to the alloc profiler; callers
// such as jl_gc_alloc_ record the allocation themselves to avoid double-counting.
jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t sz) {
    return jl_gc_big_alloc_inner(ptls, sz);
}

// Sweep list rooted at *pv, removing and freeing any unmarked objects.
// Return pointer to last `next` field in the culled list.
static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT
Expand Down Expand Up @@ -1195,7 +1211,7 @@ static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT
}

// Size includes the tag and the tag is not cleared!!
JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset,
int osize)
{
// Use the pool offset instead of the pool address as the argument
Expand Down Expand Up @@ -1251,6 +1267,23 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
return jl_valueof(v);
}

// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code.
JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
int osize)
{
    jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize);

    // No type information is available here, so record the allocation under
    // the "unknown type" sentinel tag. `osize` is the pool's object size class.
    maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag);
    return val;
}

// This wrapper exists only to prevent `jl_gc_pool_alloc_inner` from being inlined into
// its callers. We provide an external-facing interface for callers, and inline `jl_gc_pool_alloc_inner`
// into this. (See https://github.com/JuliaLang/julia/pull/43868 for more details.)
// Unlike jl_gc_pool_alloc, this does NOT record to the alloc profiler; callers
// such as jl_gc_alloc_ record the allocation themselves to avoid double-counting.
jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize) {
    return jl_gc_pool_alloc_inner(ptls, pool_offset, osize);
}

int jl_gc_classify_pools(size_t sz, int *osize)
{
if (sz > GC_MAX_SZCLASS)
Expand Down Expand Up @@ -3505,6 +3538,8 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
SetLastError(last_error);
#endif
errno = last_errno;
// jl_gc_managed_malloc is currently always used for allocating array buffers.
maybe_record_alloc_to_profile(b, sz, (jl_datatype_t*)jl_buff_tag);
return b;
}

Expand Down Expand Up @@ -3546,7 +3581,7 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
SetLastError(last_error);
#endif
errno = last_errno;

maybe_record_alloc_to_profile(b, sz, jl_gc_unknown_type_tag);
return b;
}

Expand Down
1 change: 1 addition & 0 deletions src/gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#endif
#endif
#include "julia_assert.h"
#include "gc-alloc-profiler.h"

#ifdef __cplusplus
extern "C" {
Expand Down
14 changes: 9 additions & 5 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "options.h"
#include "julia_locks.h"
#include "gc-alloc-profiler.h"
#include <uv.h>
#if !defined(_WIN32)
#include <unistd.h>
Expand Down Expand Up @@ -225,9 +226,9 @@ extern jl_array_t *jl_all_methods JL_GLOBALLY_ROOTED;
JL_DLLEXPORT extern int jl_lineno;
JL_DLLEXPORT extern const char *jl_filename;

JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
int osize);
JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t allocsz);
jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset,
int osize);
jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz);
int jl_gc_classify_pools(size_t sz, int *osize);
extern jl_mutex_t gc_perm_lock;
void *jl_gc_perm_alloc_nolock(size_t sz, int zero,
Expand Down Expand Up @@ -336,14 +337,17 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty)
int pool_id = jl_gc_szclass(allocsz);
jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id];
int osize = jl_gc_sizeclasses[pool_id];
v = jl_gc_pool_alloc(ptls, (char*)p - (char*)ptls, osize);
// We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in
// the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.)
v = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize);
}
else {
if (allocsz < sz) // overflow in adding offs, size was "negative"
jl_throw(jl_memory_exception);
v = jl_gc_big_alloc(ptls, allocsz);
v = jl_gc_big_alloc_noinline(ptls, allocsz);
}
jl_set_typeof(v, ty);
maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty);
return v;
}

Expand Down
2 changes: 2 additions & 0 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,8 @@ typedef struct _jl_tls_states_t {
// Temporary backtrace buffer. Scanned for gc roots when bt_size > 0.
struct _jl_bt_element_t *bt_data; // JL_MAX_BT_SIZE + 1 elements long
size_t bt_size; // Size for backtrace in transit in bt_data
// Temporary backtrace buffer used only for allocations profiler.
struct _jl_bt_element_t *profiling_bt_buffer;
// Atomically set by the sender, reset by the handler.
volatile _Atomic(sig_atomic_t) signal_request; // TODO: no actual reason for this to be _Atomic
// Allow the sigint to be raised asynchronously
Expand Down
Loading