make GC counters thread-local #32217

Merged 1 commit on Jun 11, 2019
4 changes: 4 additions & 0 deletions src/atomics.h
@@ -49,6 +49,8 @@
__atomic_fetch_add(obj, arg, __ATOMIC_RELAXED)
# define jl_atomic_fetch_add(obj, arg) \
__atomic_fetch_add(obj, arg, __ATOMIC_SEQ_CST)
+# define jl_atomic_add_fetch(obj, arg) \
+        __atomic_add_fetch(obj, arg, __ATOMIC_SEQ_CST)
# define jl_atomic_fetch_and_relaxed(obj, arg) \
__atomic_fetch_and(obj, arg, __ATOMIC_RELAXED)
# define jl_atomic_fetch_and(obj, arg) \
@@ -91,6 +93,8 @@
__atomic_load_n(obj, __ATOMIC_SEQ_CST)
# define jl_atomic_load_acquire(obj) \
__atomic_load_n(obj, __ATOMIC_ACQUIRE)
+# define jl_atomic_load_relaxed(obj) \
+        __atomic_load_n(obj, __ATOMIC_RELAXED)
#elif defined(_COMPILER_MICROSOFT_)
# define jl_signal_fence() _ReadWriteBarrier()

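Editor's gloss on the new macro (not text from the PR): jl_atomic_fetch_add returns the value the counter held before the addition, while the newly added jl_atomic_add_fetch returns the value after it. The rewritten maybe_collect in gc.c below needs the post-add value, since it tests whether the global allocation budget has gone non-negative. A minimal standalone illustration with the same GCC builtins:

#include <stdio.h>

int main(void)
{
    long counter = -10;
    /* fetch-then-add: returns the old value (-10); counter becomes 5 */
    long oldv = __atomic_fetch_add(&counter, 15, __ATOMIC_SEQ_CST);
    counter = -10;
    /* add-then-fetch: returns the new value (5), which is what the
     * ">= 0" budget test in maybe_collect needs */
    long newv = __atomic_add_fetch(&counter, 15, __ATOMIC_SEQ_CST);
    printf("fetch_add -> %ld, add_fetch -> %ld\n", oldv, newv);
    return 0;
}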
131 changes: 90 additions & 41 deletions src/gc.c
@@ -491,6 +491,19 @@ static size_t max_collect_interval = 1250000000UL;
static size_t max_collect_interval = 500000000UL;
#endif

+// determine how often the given thread should atomically update
+// the global allocation counter.
+// NOTE: currently the same for all threads.
+static int64_t per_thread_counter_interval(jl_ptls_t ptls)
+{
+    if (jl_n_threads == 1)
+        return gc_num.interval;
+    size_t intvl = gc_num.interval / jl_n_threads / 2;
+    if (intvl < 1048576)
+        return 1048576;
+    return intvl;
+}
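To make the throttling arithmetic concrete (illustrative numbers, not from the PR): with gc_num.interval at the 64-bit maximum of 500000000 bytes and 4 threads, each thread flushes its local count to the global counter every 500000000 / 4 / 2 = 62500000 bytes; the 1048576-byte (1 MiB) floor only engages once gc_num.interval / jl_n_threads / 2 drops below 1 MiB, around 239 threads at that interval. A self-contained sketch of the clamp (hypothetical helper name):

#include <assert.h>
#include <stdint.h>

static int64_t example_interval(int64_t global_interval, int nthreads)
{
    if (nthreads == 1)
        return global_interval;
    int64_t intvl = global_interval / nthreads / 2;
    return intvl < 1048576 ? 1048576 : intvl; /* 1 MiB floor */
}

int main(void)
{
    assert(example_interval(500000000, 4) == 62500000);  /* no clamping */
    assert(example_interval(500000000, 256) == 1048576); /* floor engages */
    return 0;
}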

// global variables for GC stats

// Resetting the object to a young object, this is used when marking the
@@ -802,16 +815,21 @@ void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT
jl_gc_queue_root(v);
}

-#define should_collect() (__unlikely(gc_num.allocd>0))
-
-static inline int maybe_collect(jl_ptls_t ptls)
+static inline void maybe_collect(jl_ptls_t ptls)
{
-    if (should_collect() || gc_debug_check_other()) {
+    int should_collect = 0;
+    if (ptls->gc_num.allocd >= 0) {
Review comment (Contributor): This function needs to use an atomic relaxed load.

Review reply (Member, PR author): Why?

Review reply (Contributor): Hmm, the line number shows up really weird... The comment was for combine_thread_gc_counts. (Actually, to avoid UB, all accesses of these outside of the collection phase should be atomic.)
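To unpack the reviewer's UB point (a sketch of the concern, not code from this PR): each thread updates its own ptls->gc_num fields with plain, non-atomic reads and writes, while combine_thread_gc_counts can read those same fields from the collecting thread. Concurrent plain access from two threads is a data race in the C11 memory model, hence undefined behavior, even when the reader uses relaxed loads. Making the owning thread's updates relaxed-atomic as well would keep the totals approximate but well-defined, e.g. with the GCC builtins already used in atomics.h:

/* Sketch only -- the PR keeps plain writes on the owning thread. Since
 * only the owner writes, a relaxed load/store pair (no RMW) suffices;
 * relaxed ordering imposes no fences, so the cost stays minimal. */
static void count_alloc_sketch(int64_t *counter, int64_t sz)
{
    int64_t v = __atomic_load_n(counter, __ATOMIC_RELAXED);
    __atomic_store_n(counter, v + sz, __ATOMIC_RELAXED);
}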

+        int64_t intvl = per_thread_counter_interval(ptls);
+        size_t localbytes = ptls->gc_num.allocd + intvl;
+        ptls->gc_num.allocd = -intvl;
+        should_collect = (jl_atomic_add_fetch(&gc_num.allocd, localbytes) >= 0);
+    }
+    if (should_collect || gc_debug_check_other()) {
        jl_gc_collect(0);
-        return 1;
    }
-    jl_gc_safepoint_(ptls);
-    return 0;
+    else {
+        jl_gc_safepoint_(ptls);
+    }
}
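The accounting scheme in the rewritten maybe_collect, as a standalone sketch (hypothetical names; the real logic is the hunk above): each thread's local counter is armed at -intvl, so the thread touches the shared counter only after allocating intvl bytes; the flush adds the accumulated bytes to the global counter, and a collection is due once that global budget in turn reaches zero.

#include <stdatomic.h>
#include <stdint.h>

/* Hypothetical names, mirroring maybe_collect above. */
static _Atomic int64_t global_allocd;      /* armed at -global_interval */
static _Thread_local int64_t local_allocd; /* armed at -thread_interval */

static int collection_due(int64_t thread_interval)
{
    if (local_allocd < 0)
        return 0;                          /* still within local credit */
    /* the counter drifted from -thread_interval past 0, so the bytes
     * allocated since the last flush are local_allocd + thread_interval */
    int64_t localbytes = local_allocd + thread_interval;
    local_allocd = -thread_interval;       /* re-arm the local credit */
    /* atomic_fetch_add returns the old value; adding localbytes back
     * yields the post-add value, like jl_atomic_add_fetch in the PR */
    return atomic_fetch_add(&global_allocd, localbytes) + localbytes >= 0;
}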

// weak references
@@ -876,12 +894,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
jl_throw(jl_memory_exception);
gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t,
gc_cblist_notify_external_alloc, (v, allocsz));
-#ifdef JULIA_ENABLE_THREADING
-    jl_atomic_fetch_add(&gc_num.allocd, allocsz);
-#else
-    gc_num.allocd += allocsz;
-#endif
-    gc_num.bigalloc++;
+    ptls->gc_num.allocd += allocsz;
+    ptls->gc_num.bigalloc++;
#ifdef MEMDEBUG
memset(v, 0xee, allocsz);
#endif
@@ -973,14 +987,44 @@ void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT

void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT
{
-    gc_num.allocd += sz;
+    jl_ptls_t ptls = jl_get_ptls_states();
+    ptls->gc_num.allocd += sz;
}

+static void combine_thread_gc_counts(jl_gc_num_t *dest)
+{
+    for (int i = 0; i < jl_n_threads; i++) {
+        jl_ptls_t ptls = jl_all_tls_states[i];
+        if (ptls) {
+            dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + per_thread_counter_interval(ptls));
+            dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed);
+            dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc);
+            dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc);
+            dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc);
+            dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
+            dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall);
+        }
+    }
+}

+static void reset_thread_gc_counts(void)
+{
+    for (int i = 0; i < jl_n_threads; i++) {
+        jl_ptls_t ptls = jl_all_tls_states[i];
+        if (ptls) {
+            memset(&ptls->gc_num, 0, sizeof(jl_thread_gc_num_t));
+            ptls->gc_num.allocd = -per_thread_counter_interval(ptls);
+        }
+    }
+}

void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT
{
+    combine_thread_gc_counts(&gc_num);
    live_bytes += (gc_num.deferred_alloc + (gc_num.allocd + gc_num.interval));
    gc_num.allocd = -(int64_t)gc_num.interval;
    gc_num.deferred_alloc = 0;
+    reset_thread_gc_counts();
}
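A reading note on the live_bytes line (derived from the code above, with illustrative numbers): allocd is re-armed to -interval after every reset, so allocd + interval recovers the bytes actually allocated since then; with interval = 100 and 30 bytes allocated, allocd sits at -70, and -70 + 100 = 30. The same identity is why combine_thread_gc_counts adds per_thread_counter_interval(ptls) back onto each thread's relaxed-loaded allocd.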

static size_t array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT
@@ -1098,16 +1142,9 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
#ifdef MEMDEBUG
return jl_gc_big_alloc(ptls, osize);
#endif
-    // FIXME - need JL_ATOMIC_FETCH_AND_ADD here
-    if (__unlikely((gc_num.allocd += osize) >= 0) || gc_debug_check_pool()) {
-        //gc_num.allocd -= osize;
-        jl_gc_collect(0);
-        //gc_num.allocd += osize;
-    }
-    else {
-        jl_gc_safepoint_(ptls);
-    }
-    gc_num.poolalloc++;
+    maybe_collect(ptls);
+    ptls->gc_num.allocd += osize;
+    ptls->gc_num.poolalloc++;
// first try to use the freelist
jl_taggedvalue_t *v = p->freelist;
if (v) {
@@ -2603,17 +2640,21 @@ JL_DLLEXPORT int jl_gc_is_enabled(void)

JL_DLLEXPORT int64_t jl_gc_total_bytes(void)
{
+    jl_gc_num_t num = gc_num;
+    combine_thread_gc_counts(&num);
    // Sync this logic with `base/util.jl:GC_Diff`
-    return (gc_num.total_allocd + gc_num.deferred_alloc +
-            gc_num.allocd + gc_num.interval);
+    return (num.total_allocd + num.deferred_alloc +
+            num.allocd + num.interval);
}
JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void)
{
return gc_num.total_time;
}
JL_DLLEXPORT jl_gc_num_t jl_gc_num(void)
{
-    return gc_num;
+    jl_gc_num_t num = gc_num;
+    combine_thread_gc_counts(&num);
+    return num;
}

JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void)
@@ -2687,6 +2728,8 @@ static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp
// Only one thread should be running in this function
static int _jl_gc_collect(jl_ptls_t ptls, int full)
{
+    combine_thread_gc_counts(&gc_num);

jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache;
jl_gc_mark_sp_t sp;
gc_mark_sp_init(gc_cache, &sp);
@@ -2853,6 +2896,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, int full)
    gc_num.total_time += pause;
    gc_num.since_sweep = 0;
    gc_num.freed = 0;
+    reset_thread_gc_counts();

return recollect;
}
@@ -2962,6 +3006,10 @@ void jl_init_thread_heap(jl_ptls_t ptls)
    gc_cache->pc_stack = (void**)malloc(init_size * sizeof(void*));
    gc_cache->pc_stack_end = gc_cache->pc_stack + init_size;
    gc_cache->data_stack = (jl_gc_mark_data_t *)malloc(init_size * sizeof(jl_gc_mark_data_t));

+    memset(&ptls->gc_num, 0, sizeof(jl_thread_gc_num_t));
+    assert(gc_num.interval == default_collect_interval);
+    ptls->gc_num.allocd = -per_thread_counter_interval(ptls);
}

// System-wide initializations
@@ -2999,8 +3047,8 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    maybe_collect(ptls);
-    gc_num.allocd += sz;
-    gc_num.malloc++;
+    ptls->gc_num.allocd += sz;
+    ptls->gc_num.malloc++;
void *b = malloc(sz);
if (b == NULL)
jl_throw(jl_memory_exception);
@@ -3011,8 +3059,8 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    maybe_collect(ptls);
-    gc_num.allocd += nm*sz;
-    gc_num.malloc++;
+    ptls->gc_num.allocd += nm*sz;
+    ptls->gc_num.malloc++;
void *b = calloc(nm, sz);
if (b == NULL)
jl_throw(jl_memory_exception);
@@ -3021,9 +3069,10 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)

JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
{
+    jl_ptls_t ptls = jl_get_ptls_states();
    free(p);
-    gc_num.freed += sz;
-    gc_num.freecall++;
+    ptls->gc_num.freed += sz;
+    ptls->gc_num.freecall++;
}

// older name for jl_gc_counted_free_with_size
@@ -3037,10 +3086,10 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz)
    jl_ptls_t ptls = jl_get_ptls_states();
    maybe_collect(ptls);
    if (sz < old)
-        gc_num.freed += (old - sz);
+        ptls->gc_num.freed += (old - sz);
    else
-        gc_num.allocd += (sz - old);
-    gc_num.realloc++;
+        ptls->gc_num.allocd += (sz - old);
+    ptls->gc_num.realloc++;
void *b = realloc(p, sz);
if (b == NULL)
jl_throw(jl_memory_exception);
@@ -3100,8 +3149,8 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
    size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
    if (allocsz < sz) // overflow in adding offs, size was "negative"
        jl_throw(jl_memory_exception);
-    gc_num.allocd += allocsz;
-    gc_num.malloc++;
+    ptls->gc_num.allocd += allocsz;
+    ptls->gc_num.malloc++;
void *b = malloc_cache_align(allocsz);
if (b == NULL)
jl_throw(jl_memory_exception);
@@ -3123,10 +3172,10 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz,
        live_bytes += allocsz - oldsz;
    }
    else if (allocsz < oldsz)
-        gc_num.freed += (oldsz - allocsz);
+        ptls->gc_num.freed += (oldsz - allocsz);
    else
-        gc_num.allocd += (allocsz - oldsz);
-    gc_num.realloc++;
+        ptls->gc_num.allocd += (allocsz - oldsz);
+    ptls->gc_num.realloc++;

void *b;
if (isaligned)
3 changes: 2 additions & 1 deletion src/init.c
@@ -742,10 +742,11 @@ void _julia_init(JL_IMAGE_SEARCH rel)
jl_error("cannot generate code-coverage or track allocation information while generating a .o or .bc output file");
}

+    jl_gc_init();
+
    jl_init_threading();
    jl_init_intrinsic_properties();

-    jl_gc_init();
    jl_gc_enable(0);

jl_resolve_sysimg_location(rel);
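Editor's note on the reordering (an inference, not stated in the PR): jl_init_threading sets up each thread's heap via jl_init_thread_heap, which now initializes the per-thread counters from gc_num.interval (see the new assert against default_collect_interval in gc.c above), so jl_gc_init must run before threading is initialized rather than after.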
11 changes: 11 additions & 0 deletions src/julia_threads.h
@@ -65,6 +65,16 @@ typedef struct {
uint16_t osize; // size of objects in this pool
} jl_gc_pool_t;

+typedef struct {
+    int64_t allocd;
+    int64_t freed;
+    uint64_t malloc;
+    uint64_t realloc;
+    uint64_t poolalloc;
+    uint64_t bigalloc;
+    uint64_t freecall;
+} jl_thread_gc_num_t;

typedef struct {
// variable for tracking weak references
arraylist_t weak_refs;
@@ -156,6 +166,7 @@ struct _jl_tls_states_t {
    volatile int8_t in_finalizer;
    int8_t disable_gc;
    jl_thread_heap_t heap;
+    jl_thread_gc_num_t gc_num;
    uv_mutex_t sleep_lock;
    uv_cond_t wake_signal;
    volatile sig_atomic_t defer_signal;