diff --git a/src/gc-debug.c b/src/gc-debug.c index 56afd20fe2751..2c7cb3d8f7647 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -468,13 +468,226 @@ void objprofile_printall(void) } #endif +STATIC_INLINE double jl_ns2ms(int64_t t) +{ + return t / (double)1e6; +} + +STATIC_INLINE double jl_ns2s(int64_t t) +{ + return t / (double)1e9; +} + +#if defined(GC_TIME) || defined(GC_FINAL_STATS) +static uint64_t gc_premark_end; +static uint64_t gc_postmark_end; +void gc_settime_premark_end(void) +{ + gc_premark_end = jl_hrtime(); +} +void gc_settime_postmark_end(void) +{ + gc_postmark_end = jl_hrtime(); +} +#endif + +#ifdef GC_FINAL_STATS +#include <malloc.h> +static double process_t0; +static size_t max_pg_count = 0; +static size_t total_freed_bytes = 0; +static uint64_t max_pause = 0; +static uint64_t total_sweep_time = 0; +static uint64_t total_mark_time = 0; +static uint64_t total_fin_time = 0; + +void gc_final_count_page(size_t pg_cnt) +{ + if (pg_cnt > max_pg_count) { + max_pg_count = pg_cnt; + } +} + +void gc_final_pause_end(int64_t t0, int64_t tend) +{ + uint64_t post_time = gc_postmark_end - gc_premark_end; + uint64_t sweep_pause = tend - gc_premark_end; + uint64_t pause = tend - t0; + total_freed_bytes += gc_num.freed; + total_sweep_time += sweep_pause - post_time; + total_fin_time += post_time; + max_pause = max_pause < pause ? pause : max_pause; + total_mark_time += gc_premark_end - t0; +} + +void jl_print_gc_stats(JL_STREAM *s) +{ + double gct = gc_num.total_time / 1e9; + malloc_stats(); + double ptime = jl_clock_now() - process_t0; + jl_printf(s, "exec time\t%.5f sec\n", ptime); + if (gc_num.pause > 0) { + jl_printf(s, "gc time \t%.5f sec (%2.1f%%) in %d (%d full) collections\n", + jl_ns2s(gc_num.total_time), + jl_ns2s(gc_num.total_time) / ptime * 100, + gc_num.pause, gc_num.full_sweep); + jl_printf(s, "gc pause \t%.2f ms avg\n\t\t%2.0f ms max\n", + jl_ns2ms(gc_num.total_time) / gc_num.pause, + jl_ns2ms(max_pause)); + jl_printf(s, "\t\t(%2d%% mark, %2d%% sweep, %2d%% finalizers)\n", + (int)(total_mark_time * 100 / gc_num.total_time), + (int)(total_sweep_time * 100 / gc_num.total_time), + (int)(total_fin_time * 100 / gc_num.total_time)); + } + int i = 0; + while (i < REGION_COUNT && regions[i].pages) i++; + jl_printf(s, "max allocated regions : %d\n", i); + struct mallinfo mi = mallinfo(); + jl_printf(s, "malloc size\t%d MB\n", mi.uordblks/1024/1024); + jl_printf(s, "max page alloc\t%ld MB\n", max_pg_count*GC_PAGE_SZ/1024/1024); + jl_printf(s, "total freed\t%" PRIuPTR " b\n", total_freed_bytes); + jl_printf(s, "free rate\t%.1f MB/sec\n", (total_freed_bytes/gct)/1024/1024); +} +#else +void jl_print_gc_stats(JL_STREAM *s) +{ +} +#endif + +#ifdef GC_TIME +static int64_t skipped_pages = 0; +static int64_t total_pages = 0; +static int64_t freed_pages = 0; +static int64_t pool_sweep_start; + +void gc_time_pool_start(void) +{ + skipped_pages = 0; + total_pages = 0; + freed_pages = 0; + pool_sweep_start = jl_hrtime(); +} + +void gc_time_count_page(int freedall, int pg_skpd) +{ + freed_pages += freedall; + skipped_pages += pg_skpd; + total_pages++; +} + +void gc_time_pool_end(int sweep_full) +{ + double sweep_pool_sec = (jl_hrtime() - pool_sweep_start) / 1e9; + double sweep_gb = total_pages * GC_PAGE_SZ / (double)(1024 * 1024 * 1024); + double sweep_speed = sweep_gb / sweep_pool_sec; + jl_printf(JL_STDOUT, + "GC sweep pools end %.2f ms at %.1f GB/s " + "(skipped %.2f %% of %" PRId64 ", swept %" PRId64 " pgs, " + "%" PRId64 " freed with %" PRId64 " lazily) %s\n", + sweep_pool_sec * 1000, sweep_speed, 
(total_pages ? ((double)skipped_pages * 100) / total_pages : 0), + total_pages, total_pages - skipped_pages, + freed_pages, lazy_freed_pages, + sweep_full ? "full" : "quick"); +} + +static int64_t big_total; +static int64_t big_freed; +static int64_t big_reset; +static int64_t big_sweep_start; + +void gc_time_big_start(void) +{ + big_total = 0; + big_freed = 0; + big_reset = 0; + big_sweep_start = jl_hrtime(); +} + +void gc_time_count_big(int old_bits, int bits) +{ + big_total++; + big_reset += bits == GC_CLEAN; + big_freed += !(old_bits & GC_MARKED); +} + +void gc_time_big_end(void) +{ + double t_ms = jl_ns2ms(jl_hrtime() - big_sweep_start); + jl_printf(JL_STDOUT, "GC sweep big %.2f " + "(freed %" PRId64 " / %" PRId64 " with %" PRId64 " rst)\n", + t_ms, big_freed, big_total, big_reset); +} + +static int64_t mallocd_array_total; +static int64_t mallocd_array_freed; +static int64_t mallocd_array_sweep_start; + +void gc_time_mallocd_array_start(void) +{ + mallocd_array_total = 0; + mallocd_array_freed = 0; + mallocd_array_sweep_start = jl_hrtime(); +} + +void gc_time_count_mallocd_array(int bits) +{ + mallocd_array_total++; + mallocd_array_freed += !(bits & GC_MARKED); +} + +void gc_time_mallocd_array_end(void) +{ + double t_ms = jl_ns2ms(jl_hrtime() - mallocd_array_sweep_start); + jl_printf(JL_STDOUT, "GC sweep arrays %.2f " + "(freed %" PRId64 " / %" PRId64 ")\n", + t_ms, mallocd_array_freed, mallocd_array_total); +} + +void gc_time_mark_pause(int64_t t0, int64_t scanned_bytes, + int64_t perm_scanned_bytes) +{ + int64_t last_remset_len = 0; + int64_t remset_nptr = 0; + for (int t_i = 0;t_i < jl_n_threads;t_i++) { + jl_tls_states_t *ptls = jl_all_tls_states[t_i]; + last_remset_len += ptls->heap.last_remset->len; + remset_nptr += ptls->heap.remset_nptr; + } + jl_printf(JL_STDOUT, "GC mark pause %.2f ms | " + "scanned %" PRId64 " kB = %" PRId64 " + %" PRId64 " | " + "remset %" PRId64 " %" PRId64 "\n", + jl_ns2ms(gc_premark_end - t0), + (scanned_bytes + perm_scanned_bytes) / 1024, + scanned_bytes / 1024, perm_scanned_bytes / 1024, + last_remset_len, remset_nptr); +} + +void gc_time_sweep_pause(uint64_t gc_end_t, int64_t actual_allocd, + int64_t live_bytes, int64_t estimate_freed, + int sweep_full) +{ + uint64_t sweep_pause = gc_end_t - gc_premark_end; + int pct = actual_allocd ? (gc_num.freed * 100) / actual_allocd : -1; + jl_printf(JL_STDOUT, "GC sweep pause %.2f ms live %" PRId64 " kB " + "(freed %" PRId64 " kB EST %" PRId64 " kB " + "[error %" PRId64 "] = %d%% of allocd b %" PRId64 ") " + "(%.2f ms in post_mark) %s | next in %" PRId64 " kB\n", + jl_ns2ms(sweep_pause), live_bytes / 1024, + gc_num.freed / 1024, estimate_freed / 1024, + gc_num.freed - estimate_freed, pct, gc_num.since_sweep / 1024, + jl_ns2ms(gc_postmark_end - gc_premark_end), + sweep_full ? 
"full" : "quick", -gc_num.allocd / 1024); +} +#endif + void gc_debug_init(void) { #ifdef GC_DEBUG_ENV gc_stack_lo = (char*)gc_get_stack_ptr(); char *env = getenv("JULIA_GC_NO_GENERATIONAL"); if (env && strcmp(env, "0") != 0) - jl_gc_debug_env.sweep_mask = GC_MARKED; + jl_gc_debug_env.always_full = 1; env = getenv("JULIA_GC_WAIT_FOR_DEBUGGER"); jl_gc_debug_env.wait_for_debugger = env && strcmp(env, "0") != 0; gc_debug_alloc_init(&jl_gc_debug_env.pool, "POOL"); @@ -495,8 +708,116 @@ void gc_debug_init(void) htable_new(&obj_sizes[g], 0); } #endif + +#ifdef GC_FINAL_STATS + process_t0 = jl_clock_now(); +#endif +} + +// GC summary stats + +#ifdef MEMPROFILE +// TODO repair this and possibly merge with `gc_count_pool` +static size_t pool_stats(jl_gc_pool_t *p, size_t *pwaste, size_t *np, + size_t *pnold) +{ + gcval_t *v; + jl_gc_pagemeta_t *pg = p->pages; + size_t osize = p->osize; + size_t nused=0, nfree=0, npgs=0, nold = 0; + + while (pg != NULL) { + npgs++; + v = (gcval_t*)(pg->data + GC_PAGE_OFFSET); + char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - osize; + int i = 0; + while ((char*)v <= lim) { + if (!gc_marked(v)) { + nfree++; + } + else { + nused++; + if (gc_bits(v) == GC_MARKED) { + nold++; + } + } + v = (gcval_t*)((char*)v + osize); + i++; + } + jl_gc_pagemeta_t *nextpg = NULL; + pg = nextpg; + } + *pwaste = npgs * GC_PAGE_SZ - (nused * p->osize); + *np = npgs; + *pnold = nold; + if (npgs != 0) { + jl_printf(JL_STDOUT, + "%4d : %7d/%7d objects (%3d%% old), %5d pages, %5d kB, %5d kB waste\n", + p->osize, + nused, + nused+nfree, + nused ? (nold*100)/nused : 0, + npgs, + (nused*p->osize)/1024, + *pwaste/1024); + } + return nused*p->osize; +} + +void gc_stats_all_pool(void) +{ + size_t nb=0, w, tw=0, no=0,tp=0, nold=0,noldbytes=0, np, nol; + for (int i = 0; i < JL_GC_N_POOLS; i++) { + for (int t_i = 0;t_i < jl_n_threads;t_i++) { + jl_tls_states_t *ptls = jl_all_tls_states[t_i]; + size_t b = pool_stats(&ptls->heap.norm_pools[i], &w, &np, &nol); + nb += b; + no += (b / ptls->heap.norm_pools[i].osize); + tw += w; + tp += np; + nold += nol; + noldbytes += nol * ptls->heap.norm_pools[i].osize; + } + } + jl_printf(JL_STDOUT, + "%d objects (%d%% old), %d kB (%d%% old) total allocated, %d total fragments (%d%% overhead), in %d pages\n", + no, (nold*100)/no, nb/1024, (noldbytes*100)/nb, tw, (tw*100)/nb, tp); } +void gc_stats_big_obj(void) +{ + bigval_t *v = current_heap->big_objects; + size_t nused=0, nbytes=0; + while (v != NULL) { + if (gc_marked(&v->_data)) { + nused++; + nbytes += v->sz&~3; + } + v = v->next; + } + v = big_objects_marked; + size_t nused_old=0, nbytes_old=0; + while (v != NULL) { + if (gc_marked(&v->_data)) { + nused_old++; + nbytes_old += v->sz&~3; + } + v = v->next; + } + + mallocarray_t *ma = current_heap->mallocarrays; + while (ma != NULL) { + if (gc_marked(jl_astaggedvalue(ma->a))) { + nused++; + nbytes += array_nbytes(ma->a); + } + ma = ma->next; + } + + jl_printf(JL_STDOUT, "%d kB (%d%% old) in %d large objects (%d%% old)\n", (nbytes + nbytes_old)/1024, nbytes + nbytes_old ? (nbytes_old*100)/(nbytes + nbytes_old) : 0, nused + nused_old, nused+nused_old ? (nused_old*100)/(nused + nused_old) : 0); +} +#endif //MEMPROFILE + // Simple and dumb way to count cells with different gc bits in allocated pages // Use as ground truth for debugging memory-leak-like issues. 
static int64_t poolobj_sizes[4]; @@ -540,9 +861,9 @@ void gc_count_pool(void) memset(&poolobj_sizes, 0, sizeof(poolobj_sizes)); empty_pages = 0; for (int i = 0; i < REGION_COUNT; i++) { - if (regions[i].pages) { - gc_count_pool_region(®ions[i]); - } + if (!regions[i].pages) + break; + gc_count_pool_region(®ions[i]); } jl_safe_printf("****** Pool stat: ******\n"); for (int i = 0;i < 4;i++) diff --git a/src/gc-pages.c b/src/gc-pages.c index f3b301c6a36cb..ead16010cee15 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -152,9 +152,7 @@ NOINLINE void *jl_gc_alloc_page(void) VirtualAlloc(ptr, GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE); #endif current_pg_count++; -#ifdef GC_FINAL_STATS - max_pg_count = max_pg_count < current_pg_count ? current_pg_count : max_pg_count; -#endif + gc_final_count_page(current_pg_count); JL_UNLOCK_NOGC(&pagealloc_lock); return ptr; } diff --git a/src/gc.c b/src/gc.c index 4648ef6e54615..bf397dc504279 100644 --- a/src/gc.c +++ b/src/gc.c @@ -262,18 +262,6 @@ static size_t max_collect_interval = 500000000UL; // global variables for GC stats -#define NS_TO_S(t) ((double)(t/1000)/(1000*1000)) -#define NS2MS(t) ((double)(t/1000)/1000) - -JL_DLLEXPORT size_t jl_gc_total_freed_bytes=0; -#ifdef GC_FINAL_STATS -static uint64_t max_pause = 0; -static uint64_t total_sweep_time = 0; -static uint64_t total_mark_time = 0; -static uint64_t total_fin_time = 0; -#endif -static int sweeping = 0; - /* * The state transition looks like : * @@ -298,7 +286,7 @@ static int sweeping = 0; * --[(quick)sweep && age<=promotion]--- */ -// A quick sweep is a sweep where sweep_mask == GC_MARKED_NOESC. +// A quick sweep is a sweep where `!sweep_full` // It means we won't touch GC_MARKED objects (old gen). // When a reachable object has survived more than PROMOTE_AGE+1 collections @@ -318,7 +306,7 @@ static int sweeping = 0; static int64_t scanned_bytes; // young bytes scanned while marking static int64_t perm_scanned_bytes; // old bytes scanned while marking -static int prev_sweep_mask = GC_MARKED; +static int prev_sweep_full = 1; #define inc_sat(v,s) v = (v) >= s ? s : (v)+1 @@ -477,14 +465,11 @@ static void sweep_weak_refs(void) size_t n = 0; size_t ndel = 0; size_t l = ptls->heap.weak_refs.len; - jl_weakref_t *wr; void **lst = ptls->heap.weak_refs.items; - void *tmp; -#define SWAP_wr(a,b) (tmp=a,a=b,b=tmp,1) if (l == 0) continue; - do { - wr = (jl_weakref_t*)lst[n]; + while (1) { + jl_weakref_t *wr = (jl_weakref_t*)lst[n]; if (gc_marked(jl_astaggedvalue(wr))) { // weakref itself is alive if (!gc_marked(jl_astaggedvalue(wr->value))) @@ -494,8 +479,12 @@ static void sweep_weak_refs(void) else { ndel++; } - } while ((n < l-ndel) && SWAP_wr(lst[n],lst[n+ndel])); - + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n+ndel] = tmp; + } ptls->heap.weak_refs.len -= ndel; } } @@ -528,33 +517,27 @@ static NOINLINE void *alloc_big(size_t sz) return (void*)&v->header; } -static int big_total; -static int big_freed; -static int big_reset; - // Sweep list rooted at *pv, removing and freeing any unmarked objects. // Return pointer to last `next` field in the culled list. 
-static bigval_t **sweep_big_list(int sweep_mask, bigval_t **pv) +static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) { bigval_t *v = *pv; while (v != NULL) { bigval_t *nxt = v->next; - if (gc_marked(&v->header)) { + int bits = gc_bits(&v->header); + int old_bits = bits; + if (bits & GC_MARKED) { pv = &v->next; int age = v->age; - int bits = gc_bits(&v->header); - if (age >= PROMOTE_AGE) { - if (sweep_mask == GC_MARKED || bits == GC_MARKED_NOESC) { + if (age >= PROMOTE_AGE || bits == GC_MARKED) { + if (sweep_full || bits == GC_MARKED_NOESC) { bits = GC_QUEUED; } } else { inc_sat(age, PROMOTE_AGE); v->age = age; - if ((sweep_mask & bits) == sweep_mask) { - bits = GC_CLEAN; - big_reset++; - } + bits = GC_CLEAN; } gc_bits(&v->header) = bits; } @@ -568,21 +551,20 @@ static bigval_t **sweep_big_list(int sweep_mask, bigval_t **pv) memset(v, 0xbb, v->sz&~3); #endif jl_free_aligned(v); - big_freed++; } - big_total++; + gc_time_count_big(old_bits, bits); v = nxt; } return pv; } -static void sweep_big(int sweep_mask) +static void sweep_big(int sweep_full) { + gc_time_big_start(); for (int i = 0;i < jl_n_threads;i++) - sweep_big_list(sweep_mask, - &jl_all_tls_states[i]->heap.big_objects); - if (sweep_mask == GC_MARKED) { - bigval_t **last_next = sweep_big_list(sweep_mask, &big_objects_marked); + sweep_big_list(sweep_full, &jl_all_tls_states[i]->heap.big_objects); + if (sweep_full) { + bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); // Move all survivors from big_objects_marked list to big_objects list. if (jl_thread_heap.big_objects) jl_thread_heap.big_objects->prev = last_next; @@ -592,6 +574,7 @@ static void sweep_big(int sweep_mask) jl_thread_heap.big_objects->prev = &jl_thread_heap.big_objects; big_objects_marked = NULL; } + gc_time_big_end(); } // tracking Arrays with malloc'd storage @@ -638,19 +621,17 @@ static void jl_gc_free_array(jl_array_t *a) } } -static int mallocd_array_total; -static int mallocd_array_freed; - - static void sweep_malloced_arrays(void) { + gc_time_mallocd_array_start(); for (int t_i = 0;t_i < jl_n_threads;t_i++) { jl_tls_states_t *ptls = jl_all_tls_states[t_i]; mallocarray_t *ma = ptls->heap.mallocarrays; mallocarray_t **pma = &ptls->heap.mallocarrays; while (ma != NULL) { mallocarray_t *nxt = ma->next; - if (gc_marked(jl_astaggedvalue(ma->a))) { + int bits = jl_astaggedvalue(ma->a)->gc_bits; + if (bits & GC_MARKED) { pma = &ma->next; } else { @@ -659,12 +640,12 @@ static void sweep_malloced_arrays(void) jl_gc_free_array(ma->a); ma->next = ptls->heap.mafreelist; ptls->heap.mafreelist = ma; - mallocd_array_freed++; } - mallocd_array_total++; + gc_time_count_mallocd_array(bits); ma = nxt; } } + gc_time_mallocd_array_end(); } // pool allocation @@ -823,78 +804,59 @@ static inline int szclass(size_t sz) // sweep phase -static int skipped_pages = 0; -static int total_pages = 0; -static int freed_pages = 0; -static int lazy_freed_pages = 0; -static int page_done = 0; -static gcval_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, gcval_t **pfl,int,int); -static void sweep_pool_region(gcval_t ***pfl, int region_i, int sweep_mask) -{ - region_t *region = ®ions[region_i]; - - // the actual sweeping - int ub = 0; - int lb = region->lb; - for (int pg_i = 0; pg_i <= region->ub; pg_i++) { - uint32_t line = region->allocmap[pg_i]; - if (line) { - ub = pg_i; - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - jl_gc_pagemeta_t *pg = ®ion->meta[pg_i*32 + j]; - int p_n = pg->pool_n; - int t_n = pg->thread_n; - jl_tls_states_t *ptls = 
jl_all_tls_states[t_n]; - jl_gc_pool_t *p = &ptls->heap.norm_pools[p_n]; - int osize = pg->osize; - pfl[t_n * JL_GC_N_POOLS + p_n] = sweep_page(p, pg, pfl[t_n * JL_GC_N_POOLS + p_n], sweep_mask, osize); - } - } - } - else if (pg_i < lb) { - lb = pg_i; - } - } - region->ub = ub; - region->lb = lb; -} +int64_t lazy_freed_pages = 0; // Returns pointer to terminal pointer of list rooted at *pfl. -static gcval_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, gcval_t **pfl, int sweep_mask, int osize) -{ - int freedall; - gcval_t **prev_pfl = pfl; - gcval_t *v; - size_t old_nfree = 0, nfree = 0; - int pg_freedall = 0, pg_total = 0, pg_skpd = 0; - int obj_per_page = (GC_PAGE_SZ - GC_PAGE_OFFSET)/osize; +static gcval_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, gcval_t **pfl, int sweep_full, int osize) +{ char *data = pg->data; uint8_t *ages = pg->ages; - v = (gcval_t*)(data + GC_PAGE_OFFSET); + gcval_t *v = (gcval_t*)(data + GC_PAGE_OFFSET); char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - osize; - freedall = 1; - old_nfree += pg->nfree; - - if (!pg->has_marked) - goto free_page; + size_t old_nfree = pg->nfree; + size_t nfree; + + int freedall = 1; + int pg_skpd = 1; + if (!pg->has_marked) { + // lazy version: (empty) if the whole page was already unused, free it + // eager version: (freedall) free page as soon as possible + // the eager one uses less memory. + // FIXME - need to do accounting on a per-thread basis + // on quick sweeps, keep a few pages empty but allocated for performance + if (!sweep_full && lazy_freed_pages <= default_collect_interval / GC_PAGE_SZ) { + gcval_t *begin = reset_page(p, pg, 0); + gcval_t **pend = (gcval_t**)((char*)begin + ((int)pg->nfree - 1)*osize); + gcval_t *npg = p->newpages; + *pend = npg; + p->newpages = begin; + begin->next = (gcval_t*)0; + lazy_freed_pages++; + } + else { + jl_gc_free_page(data); + } + nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / osize; + goto done; + } // For quick sweep, we might be able to skip the page if the page doesn't // have any young live cell before marking. - if (sweep_mask == GC_MARKED_NOESC && !pg->has_young) { - assert(prev_sweep_mask == GC_MARKED_NOESC || pg->prev_nold >= pg->nold); - if (prev_sweep_mask == GC_MARKED_NOESC || pg->prev_nold == pg->nold) { + if (!sweep_full && !pg->has_young) { + assert(!prev_sweep_full || pg->prev_nold >= pg->nold); + if (!prev_sweep_full || pg->prev_nold == pg->nold) { // the position of the freelist begin/end in this page // is stored in its metadata if (pg->fl_begin_offset != (uint16_t)-1) { *pfl = page_pfl_beg(pg); - pfl = prev_pfl = (gcval_t**)page_pfl_end(pg); + pfl = (gcval_t**)page_pfl_end(pg); } - pg_skpd++; freedall = 0; - goto free_page; + nfree = pg->nfree; + goto done; } } + pg_skpd = 0; { // scope to avoid clang goto errors int has_marked = 0; int has_young = 0; @@ -915,7 +877,7 @@ static gcval_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, gcval_t **pfl if (*ages & msk || bits == GC_MARKED) { // old enough // `!age && bits == GC_MARKED` is possible for // non-first-class objects like `jl_binding_t` - if (sweep_mask == GC_MARKED || bits == GC_MARKED_NOESC) { + if (sweep_full || bits == GC_MARKED_NOESC) { bits = gc_bits(v) = GC_QUEUED; // promote } prev_nold++; @@ -944,88 +906,60 @@ static gcval_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, gcval_t **pfl pg->fl_end_offset = pfl_begin ? 
(char*)pfl - data : (uint16_t)-1; pg->nfree = pg_nfree; - if (sweep_mask == GC_MARKED) { + if (sweep_full) { pg->nold = 0; pg->prev_nold = prev_nold; } - page_done++; } - free_page: - pg_freedall += freedall; + nfree = pg->nfree; - // lazy version: (empty) if the whole page was already unused, free it - // eager version: (freedall) free page as soon as possible - // the eager one uses less memory. - pg_total++; - if (freedall) { - // FIXME - need to do accounting on a per-thread basis - // on quick sweeps, keep a few pages empty but allocated for performance - if (sweep_mask == GC_MARKED_NOESC && lazy_freed_pages <= default_collect_interval/GC_PAGE_SZ) { - gcval_t *begin = reset_page(p, pg, 0); - gcval_t **pend = (gcval_t**)((char*)begin + ((int)pg->nfree - 1)*osize); - gcval_t *npg = p->newpages; - *pend = npg; - p->newpages = begin; - begin->next = (gcval_t*)0; - lazy_freed_pages++; - pfl = prev_pfl; +done: + gc_time_count_page(freedall, pg_skpd); + gc_num.freed += (nfree - old_nfree) * osize; + return pfl; +} + +static void sweep_pool_region(gcval_t ***pfl, int region_i, int sweep_full) +{ + region_t *region = ®ions[region_i]; + + // the actual sweeping + int ub = 0; + int lb = region->lb; + for (int pg_i = 0; pg_i <= region->ub; pg_i++) { + uint32_t line = region->allocmap[pg_i]; + if (line) { + ub = pg_i; + for (int j = 0; j < 32; j++) { + if ((line >> j) & 1) { + jl_gc_pagemeta_t *pg = ®ion->meta[pg_i*32 + j]; + int p_n = pg->pool_n; + int t_n = pg->thread_n; + jl_tls_states_t *ptls = jl_all_tls_states[t_n]; + jl_gc_pool_t *p = &ptls->heap.norm_pools[p_n]; + int osize = pg->osize; + pfl[t_n * JL_GC_N_POOLS + p_n] = sweep_page(p, pg, pfl[t_n * JL_GC_N_POOLS + p_n], sweep_full, osize); + } + } } - else { - pfl = prev_pfl; -#ifdef MEMDEBUG - memset(pg->data, 0xbb, GC_PAGE_SZ); -#endif - jl_gc_free_page(data); -#ifdef MEMDEBUG - memset(pg, 0xbb, sizeof(jl_gc_pagemeta_t)); -#endif + else if (pg_i < lb) { + lb = pg_i; } - freed_pages++; - nfree += obj_per_page; } - else { - nfree += pg->nfree; - } - - skipped_pages += pg_skpd; - total_pages += pg_total; - gc_num.freed += (nfree - old_nfree)*osize; - return pfl; + region->ub = ub; + region->lb = lb; } -static void gc_sweep_once(int sweep_mask) +static void gc_sweep_other(int sweep_full) { -#ifdef GC_TIME - double t0 = jl_clock_now(); - mallocd_array_total = 0; - mallocd_array_freed = 0; -#endif sweep_malloced_arrays(); -#ifdef GC_TIME - jl_printf(JL_STDOUT, "GC sweep arrays %.2f (freed %d/%d)\n", (jl_clock_now() - t0)*1000, mallocd_array_freed, mallocd_array_total); - t0 = jl_clock_now(); - big_total = 0; - big_freed = 0; - big_reset = 0; -#endif - sweep_big(sweep_mask); -#ifdef GC_TIME - jl_printf(JL_STDOUT, "GC sweep big %.2f (freed %d/%d with %d rst)\n", (jl_clock_now() - t0)*1000, big_freed, big_total, big_reset); -#endif + sweep_big(sweep_full); } -// returns 0 if not finished -static int gc_sweep_inc(int sweep_mask) +static void gc_sweep_pool(int sweep_full) { -#ifdef GC_TIME - double t0 = jl_clock_now(); -#endif - skipped_pages = 0; - total_pages = 0; - freed_pages = 0; + gc_time_pool_start(); lazy_freed_pages = 0; - page_done = 0; - int finished = 1; gcval_t ***pfl = (gcval_t ***) alloca(jl_n_threads * JL_GC_N_POOLS * sizeof(gcval_t**)); @@ -1055,8 +989,9 @@ static int gc_sweep_inc(int sweep_mask) } for (int i = 0; i < REGION_COUNT; i++) { - if (regions[i].pages) - /*finished &= */sweep_pool_region(pfl, i, sweep_mask); + if (!regions[i].pages) + break; + sweep_pool_region(pfl, i, sweep_full); } @@ -1072,12 +1007,7 @@ static int 
gc_sweep_inc(int sweep_mask) } } -#ifdef GC_TIME - double sweep_pool_sec = jl_clock_now() - t0; - double sweep_speed = ((((double)total_pages)*GC_PAGE_SZ)/(1024*1024*1024))/sweep_pool_sec; - jl_printf(JL_STDOUT, "GC sweep pools %s %.2f at %.1f GB/s (skipped %d%% of %d, done %d pgs, %d freed with %d lazily) mask %d\n", finished ? "end" : "inc", sweep_pool_sec*1000, sweep_speed, total_pages ? (skipped_pages*100)/total_pages : 0, total_pages, page_done, freed_pages, lazy_freed_pages, sweep_mask); -#endif - return finished; + gc_time_pool_end(sweep_full); } // mark phase @@ -1440,7 +1370,7 @@ static int push_root(jl_value_t *v, int d, int bits) abort(); } - ret: +ret: if (gc_verifying) return bits; if ((bits == GC_MARKED) && (refyoung == GC_MARKED_NOESC)) { @@ -1453,39 +1383,27 @@ static int push_root(jl_value_t *v, int d, int bits) #undef MARK queue_the_root: - if (mark_sp >= mark_stack_size) grow_mark_stack(); + if (mark_sp >= mark_stack_size) + grow_mark_stack(); mark_stack[mark_sp++] = (jl_value_t*)v; return bits; } -static void visit_mark_stack_inc(int mark_mode) +static void visit_mark_stack(void) { - while(mark_sp > 0 && !should_timeout()) { + while (mark_sp > 0 && !should_timeout()) { jl_value_t *v = mark_stack[--mark_sp]; - assert(gc_bits(jl_astaggedvalue(v)) == GC_QUEUED || - gc_bits(jl_astaggedvalue(v)) == GC_MARKED || - gc_bits(jl_astaggedvalue(v)) == GC_MARKED_NOESC); + assert(gc_bits(jl_astaggedvalue(v))); push_root(v, 0, gc_bits(jl_astaggedvalue(v))); } -} - -static void visit_mark_stack(int mark_mode) -{ - visit_mark_stack_inc(mark_mode); assert(!mark_sp); } void jl_mark_box_caches(void); -#if defined(GCTIME) || defined(GC_FINAL_STATS) -double jl_clock_now(void); -#endif - extern jl_module_t *jl_old_base_module; extern jl_array_t *jl_module_init_order; -static int inc_count = 0; - // mark the initial root set void pre_mark(void) { @@ -1569,7 +1487,7 @@ void post_mark(arraylist_t *list, int dryrun) arraylist_push(&finalizer_list_marked, fin); } } - visit_mark_stack(GC_MARKED_NOESC); + visit_mark_stack(); } // collector entry point and control @@ -1620,235 +1538,156 @@ JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) } void jl_gc_sync_total_bytes(void) {last_gc_total_bytes = jl_gc_total_bytes();} -#if defined(MEMPROFILE) -static void all_pool_stats(void); -static void big_obj_stats(void); -#endif - -#if defined(GC_TIME) -static int saved_mark_sp = 0; -#endif -static int sweep_mask = GC_MARKED; #define MIN_SCAN_BYTES 1024*1024 // Only one thread should be running in this function static void _jl_gc_collect(int full, char *stack_hi) { uint64_t t0 = jl_hrtime(); - int recollect = 0; -#if defined(GC_TIME) - int wb_activations = mark_sp - saved_mark_sp; -#endif int64_t last_perm_scanned_bytes = perm_scanned_bytes; - if (!sweeping) { - inc_count++; - assert(mark_sp == 0); - - // 1. 
mark every object in the remset - reset_remset(); - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_tls_states_t *ptls = jl_all_tls_states[t_i]; - // avoid counting remembered objects & bindings twice in perm_scanned_bytes - for(int i = 0; i < ptls->heap.last_remset->len; i++) { - jl_value_t *item = (jl_value_t*)ptls->heap.last_remset->items[i]; - objprofile_count(jl_typeof(item), 2, 0); - gc_bits(jl_astaggedvalue(item)) = GC_MARKED; - } - for (int i = 0; i < ptls->heap.rem_bindings.len; i++) { - void *ptr = ptls->heap.rem_bindings.items[i]; - gc_bits(gc_val_buf(ptr)) = GC_MARKED; - } + assert(mark_sp == 0); - for (int i = 0; i < ptls->heap.last_remset->len; i++) { - jl_value_t *item = (jl_value_t*)ptls->heap.last_remset->items[i]; - push_root(item, 0, GC_MARKED); - } + // 1. mark every object in the remset + reset_remset(); + for (int t_i = 0;t_i < jl_n_threads;t_i++) { + jl_tls_states_t *ptls = jl_all_tls_states[t_i]; + // avoid counting remembered objects & bindings twice in perm_scanned_bytes + for (int i = 0; i < ptls->heap.last_remset->len; i++) { + jl_value_t *item = (jl_value_t*)ptls->heap.last_remset->items[i]; + objprofile_count(jl_typeof(item), 2, 0); + gc_bits(jl_astaggedvalue(item)) = GC_MARKED; } - - // 2. mark every object in a remembered binding - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_tls_states_t *ptls = jl_all_tls_states[t_i]; - int n_bnd_refyoung = 0; - for (int i = 0; i < ptls->heap.rem_bindings.len; i++) { - jl_binding_t *ptr = (jl_binding_t*)ptls->heap.rem_bindings.items[i]; - // A null pointer can happen here when the binding is cleaned up - // as an exception is thrown after it was already queued (#10221) - if (!ptr->value) continue; - if (gc_push_root(ptr->value, 0) == GC_MARKED_NOESC) { - ptls->heap.rem_bindings.items[n_bnd_refyoung] = ptr; - n_bnd_refyoung++; - } - } - ptls->heap.rem_bindings.len = n_bnd_refyoung; + for (int i = 0; i < ptls->heap.rem_bindings.len; i++) { + void *ptr = ptls->heap.rem_bindings.items[i]; + gc_bits(gc_val_buf(ptr)) = GC_MARKED; } - // 3. walk roots - pre_mark(); - visit_mark_stack(GC_MARKED_NOESC); - - gc_num.since_sweep += gc_num.allocd + (int64_t)gc_num.interval; - -#if defined(GC_TIME) || defined(GC_FINAL_STATS) - uint64_t mark_pause = jl_hrtime() - t0; -#endif -#ifdef GC_TIME - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_tls_states_t *ptls = jl_all_tls_states[t_i]; - jl_printf(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d\n", NS2MS(mark_pause), (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, ptls->heap.last_remset->len, ptls->heap.remset_nptr); + for (int i = 0; i < ptls->heap.last_remset->len; i++) { + jl_value_t *item = (jl_value_t*)ptls->heap.last_remset->items[i]; + push_root(item, 0, GC_MARKED); } - saved_mark_sp = mark_sp; -#endif -#ifdef GC_FINAL_STATS - total_mark_time += mark_pause; -#endif } - #ifdef GC_TIME - int64_t bonus = -1, SAVE = -1, SAVE2 = -1, SAVE3 = -1, pct = -1; - #endif - int64_t estimate_freed = -1; -#if defined(GC_TIME) || defined(GC_FINAL_STATS) - uint64_t post_time = 0; -#endif - if (mark_sp == 0 || sweeping) { -#if defined(GC_TIME) || defined(GC_FINAL_STATS) - uint64_t sweep_t0 = jl_hrtime(); -#endif - int64_t actual_allocd = gc_num.since_sweep; - if (!sweeping) { - // marking is over -#if defined(GC_TIME) || defined(GC_FINAL_STATS) - post_time = jl_hrtime(); -#endif - // 4. 
check for objects to finalize - post_mark(&finalizer_list, 0); - if (prev_sweep_mask == GC_MARKED) { - post_mark(&finalizer_list_marked, 0); + // 2. mark every object in a remembered binding + for (int t_i = 0;t_i < jl_n_threads;t_i++) { + jl_tls_states_t *ptls = jl_all_tls_states[t_i]; + int n_bnd_refyoung = 0; + for (int i = 0; i < ptls->heap.rem_bindings.len; i++) { + jl_binding_t *ptr = (jl_binding_t*)ptls->heap.rem_bindings.items[i]; + // A null pointer can happen here when the binding is cleaned up + // as an exception is thrown after it was already queued (#10221) + if (!ptr->value) continue; + if (gc_push_root(ptr->value, 0) == GC_MARKED_NOESC) { + ptls->heap.rem_bindings.items[n_bnd_refyoung] = ptr; + n_bnd_refyoung++; } -#if defined(GC_TIME) || defined(GC_FINAL_STATS) - post_time = jl_hrtime() - post_time; -#endif - int64_t live_sz_ub = live_bytes + actual_allocd; - int64_t live_sz_est = scanned_bytes + perm_scanned_bytes; - estimate_freed = live_sz_ub - live_sz_est; - - gc_verify(); - -#if defined(MEMPROFILE) - all_pool_stats(); - big_obj_stats(); -#endif - objprofile_printall(); - objprofile_reset(); - gc_num.total_allocd += gc_num.since_sweep; - if (prev_sweep_mask == GC_MARKED_NOESC) - promoted_bytes += perm_scanned_bytes - last_perm_scanned_bytes; - // 5. next collection decision - int not_freed_enough = estimate_freed < (7*(actual_allocd/10)); - int nptr = 0; - for (int i = 0;i < jl_n_threads;i++) - nptr += jl_all_tls_states[i]->heap.remset_nptr; - int large_frontier = nptr*sizeof(void*) >= default_collect_interval; // many pointers in the intergen frontier => "quick" mark is not quick - if ((full || large_frontier || - ((not_freed_enough || promoted_bytes >= gc_num.interval) && - (promoted_bytes >= default_collect_interval || - prev_sweep_mask == GC_MARKED)) || - gc_check_heap_size(live_sz_ub, live_sz_est)) && - gc_num.pause > 1) { - gc_update_heap_size(live_sz_ub, live_sz_est); - if (prev_sweep_mask != GC_MARKED || full) { - if (full) recollect = 1; // TODO enable this? - } - if (large_frontier) - gc_num.interval = last_long_collect_interval; - if (not_freed_enough || large_frontier) { - if (gc_num.interval < default_collect_interval) - gc_num.interval = default_collect_interval; - else if (gc_num.interval <= 2*(max_collect_interval/5)) { - gc_num.interval = 5*(gc_num.interval/2); - } - } - last_long_collect_interval = gc_num.interval; - sweep_mask = GC_MARKED; - promoted_bytes = 0; + } + ptls->heap.rem_bindings.len = n_bnd_refyoung; + } + + // 3. walk roots + pre_mark(); + visit_mark_stack(); + gc_num.since_sweep += gc_num.allocd + (int64_t)gc_num.interval; + gc_settime_premark_end(); + gc_time_mark_pause(t0, scanned_bytes, perm_scanned_bytes); + int64_t actual_allocd = gc_num.since_sweep; + // marking is over + // 4. check for objects to finalize + post_mark(&finalizer_list, 0); + if (prev_sweep_full) + post_mark(&finalizer_list_marked, 0); + gc_settime_postmark_end(); + + int64_t live_sz_ub = live_bytes + actual_allocd; + int64_t live_sz_est = scanned_bytes + perm_scanned_bytes; + int64_t estimate_freed = live_sz_ub - live_sz_est; + + gc_verify(); + + gc_stats_all_pool(); + gc_stats_big_obj(); + objprofile_printall(); + objprofile_reset(); + gc_num.total_allocd += gc_num.since_sweep; + if (!prev_sweep_full) + promoted_bytes += perm_scanned_bytes - last_perm_scanned_bytes; + // 5. 
next collection decision + int not_freed_enough = estimate_freed < (7*(actual_allocd/10)); + int nptr = 0; + for (int i = 0;i < jl_n_threads;i++) + nptr += jl_all_tls_states[i]->heap.remset_nptr; + int large_frontier = nptr*sizeof(void*) >= default_collect_interval; // many pointers in the intergen frontier => "quick" mark is not quick + int sweep_full; + int recollect = 0; + if ((full || large_frontier || + ((not_freed_enough || promoted_bytes >= gc_num.interval) && + (promoted_bytes >= default_collect_interval || prev_sweep_full)) || + gc_check_heap_size(live_sz_ub, live_sz_est)) && + gc_num.pause > 1) { + gc_update_heap_size(live_sz_ub, live_sz_est); + recollect = full; + if (large_frontier) + gc_num.interval = last_long_collect_interval; + if (not_freed_enough || large_frontier) { + if (gc_num.interval < default_collect_interval) { + gc_num.interval = default_collect_interval; } - else { - gc_num.interval = default_collect_interval/2; - sweep_mask = gc_quick_sweep_mask; + else if (gc_num.interval <= 2*(max_collect_interval/5)) { + gc_num.interval = 5 * (gc_num.interval / 2); } - if (sweep_mask == GC_MARKED) - perm_scanned_bytes = 0; - scanned_bytes = 0; - // 5. start sweeping - sweep_weak_refs(); - gc_sweep_once(sweep_mask); - sweeping = 1; - gc_scrub(stack_hi); } - if (gc_sweep_inc(sweep_mask)) { - // sweeping is over - // 6. if it is a quick sweep, put back the remembered objects in queued state - // so that we don't trigger the barrier again on them. - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_tls_states_t *ptls = jl_all_tls_states[t_i]; - if (sweep_mask == GC_MARKED_NOESC) { - for (int i = 0; i < ptls->heap.remset->len; i++) { - gc_bits(jl_astaggedvalue(ptls->heap.remset->items[i])) = GC_QUEUED; - } - for (int i = 0; i < ptls->heap.rem_bindings.len; i++) { - void *ptr = ptls->heap.rem_bindings.items[i]; - gc_bits(gc_val_buf(ptr)) = GC_QUEUED; - } - } - else { - ptls->heap.remset->len = 0; - ptls->heap.rem_bindings.len = 0; - } + last_long_collect_interval = gc_num.interval; + sweep_full = 1; + promoted_bytes = 0; + } + else { + gc_num.interval = default_collect_interval / 2; + sweep_full = gc_sweep_always_full; + } + if (sweep_full) + perm_scanned_bytes = 0; + scanned_bytes = 0; + // 5. start sweeping + sweep_weak_refs(); + gc_sweep_other(sweep_full); + gc_scrub(stack_hi); + gc_sweep_pool(sweep_full); + // sweeping is over + // 6. if it is a quick sweep, put back the remembered objects in queued state + // so that we don't trigger the barrier again on them. + for (int t_i = 0;t_i < jl_n_threads;t_i++) { + jl_tls_states_t *ptls = jl_all_tls_states[t_i]; + if (!sweep_full) { + for (int i = 0; i < ptls->heap.remset->len; i++) { + gc_bits(jl_astaggedvalue(ptls->heap.remset->items[i])) = GC_QUEUED; + } + for (int i = 0; i < ptls->heap.rem_bindings.len; i++) { + void *ptr = ptls->heap.rem_bindings.items[i]; + gc_bits(gc_val_buf(ptr)) = GC_QUEUED; } - gc_num.full_sweep += sweep_mask != GC_MARKED_NOESC; - - sweeping = 0; -#ifdef GC_TIME - SAVE2 = gc_num.freed; - SAVE3 = gc_num.since_sweep; - pct = actual_allocd ? 
(gc_num.freed*100)/actual_allocd : -1; -#endif - prev_sweep_mask = sweep_mask; - - - gc_num.allocd = -(int64_t)gc_num.interval; - inc_count = 0; - live_bytes += -gc_num.freed + gc_num.since_sweep; - gc_num.since_sweep = 0; - jl_gc_total_freed_bytes += gc_num.freed; - gc_num.freed = 0; } -#if defined(GC_FINAL_STATS) || defined(GC_TIME) - uint64_t sweep_pause = jl_hrtime() - sweep_t0; -#endif -#ifdef GC_FINAL_STATS - total_sweep_time += sweep_pause - post_time; - total_fin_time += + post_time; -#endif -#ifdef GC_TIME - jl_printf(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB EST %d kB [error %d] = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark) (marked in %d inc) mask %d | next in %d kB\n", NS2MS(sweep_pause), live_bytes/1024, SAVE2/1024, estimate_freed/1024, (SAVE2 - estimate_freed), pct, SAVE3/1024, bonus/1024, SAVE/1024, NS2MS(post_time), inc_count, sweep_mask, -gc_num.allocd/1024); -#endif + else { + ptls->heap.remset->len = 0; + ptls->heap.rem_bindings.len = 0; + } } + + uint64_t gc_end_t = jl_hrtime(); + uint64_t pause = gc_end_t - t0; + gc_final_pause_end(t0, gc_end_t); + gc_time_sweep_pause(gc_end_t, actual_allocd, live_bytes, + estimate_freed, sweep_full); + gc_num.full_sweep += sweep_full; + prev_sweep_full = sweep_full; + gc_num.allocd = -(int64_t)gc_num.interval; + live_bytes += -gc_num.freed + gc_num.since_sweep; + gc_num.pause += !recollect; + gc_num.total_time += pause; -#ifdef GC_FINAL_STATS - max_pause = max_pause < pause ? pause : max_pause; -#endif + gc_num.since_sweep = 0; + gc_num.freed = 0; -#ifdef GC_TIME - if (estimate_freed != SAVE2) { - // this should not happen but it does - // mostly because of gc_counted_* allocations - } -#endif if (recollect) { - gc_num.pause--; _jl_gc_collect(0, stack_hi); } } @@ -1916,29 +1755,6 @@ void *allocb(size_t sz) return &b->data[0]; } -/* this function is horribly broken in that it is unable to fix the bigval_t pointer chain after the realloc - * so it is basically just completely invalid in the bigval_t case -void *reallocb(void *b, size_t sz) -{ - buff_t *buff = gc_val_buf(b); - if (buff->pooled) { - void* b2 = allocb(sz); - memcpy(b2, b, page_metadata(buff)->osize); - return b2; - } - else { - size_t allocsz = LLT_ALIGN(sz + sizeof(bigval_t), 16); - if (allocsz < sz) // overflow in adding offs, size was "negative" - jl_throw(jl_memory_exception); - bigval_t *bv = bigval_header(buff); - bv = (bigval_t*)realloc_cache_align(bv, allocsz, bv->sz&~3); - if (bv == NULL) - jl_throw(jl_memory_exception); - return &bv->data[0]; - } -} -*/ - JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) { size_t allocsz = sz + sizeof_jl_taggedvalue_t; @@ -1978,37 +1794,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void) return jl_valueof(tag); } -#ifdef GC_FINAL_STATS -static double process_t0; -size_t max_pg_count = 0; -#include <malloc.h> -void jl_print_gc_stats(JL_STREAM *s) -{ - double gct = gc_num.total_time/1e9; - malloc_stats(); - double ptime = jl_clock_now()-process_t0; - jl_printf(s, "exec time\t%.5f sec\n", ptime); - if (gc_num.pause > 0) { - jl_printf(s, "gc time \t%.5f sec (%2.1f%%) in %d (%d full) collections\n", - NS_TO_S(gc_num.total_time), (NS_TO_S(gc_num.total_time)/ptime)*100, gc_num.pause, gc_num.full_sweep); - jl_printf(s, "gc pause \t%.2f ms avg\n\t\t%2.0f ms max\n", - NS2MS(gc_num.total_time)/gc_num.pause, NS2MS(max_pause)); - jl_printf(s, "\t\t(%2d%% mark, %2d%% sweep, %2d%% finalizers)\n", - (int)(total_mark_time * 100 / gc_num.total_time), - (int)(total_sweep_time * 
100 / gc_num.total_time), - (int)(total_fin_time * 100 / gc_num.total_time)); - } - int i = 0; - while (i < REGION_COUNT && regions[i].pages) i++; - jl_printf(s, "max allocated regions : %d\n", i); - struct mallinfo mi = mallinfo(); - jl_printf(s, "malloc size\t%d MB\n", mi.uordblks/1024/1024); - jl_printf(s, "max page alloc\t%ld MB\n", max_pg_count*GC_PAGE_SZ/1024/1024); - jl_printf(s, "total freed\t%" PRIuPTR " b\n", jl_gc_total_freed_bytes); - jl_printf(s, "free rate\t%.1f MB/sec\n", (jl_gc_total_freed_bytes/gct)/1024/1024); -} -#endif - // Per-thread initialization (when threading is fully implemented) void jl_mk_thread_heap(jl_thread_heap_t *heap) { @@ -2047,10 +1832,6 @@ void jl_gc_init(void) last_long_collect_interval = default_collect_interval; gc_num.allocd = -default_collect_interval; -#ifdef GC_FINAL_STATS - process_t0 = jl_clock_now(); -#endif - #ifdef _P64 // on a big memory machine, set max_collect_interval to totalmem/ncores/2 size_t maxmem = (uv_get_total_memory()/jl_cpu_cores())/2; @@ -2059,109 +1840,6 @@ void jl_gc_init(void) #endif } -// GC summary stats - -#if defined(MEMPROFILE) -// TODO repair this -static size_t pool_stats(jl_gc_pool_t *p, size_t *pwaste, size_t *np, size_t *pnold) -{ - gcval_t *v; - jl_gc_pagemeta_t *pg = p->pages; - size_t osize = p->osize; - size_t nused=0, nfree=0, npgs=0, nold = 0; - - while (pg != NULL) { - npgs++; - v = (gcval_t*)(pg->data + GC_PAGE_OFFSET); - char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - osize; - int i = 0; - while ((char*)v <= lim) { - if (!gc_marked(v)) { - nfree++; - } - else { - nused++; - if (gc_bits(v) == GC_MARKED) { - nold++; - } - } - v = (gcval_t*)((char*)v + osize); - i++; - } - jl_gc_pagemeta_t *nextpg = NULL; - pg = nextpg; - } - *pwaste = npgs * GC_PAGE_SZ - (nused * p->osize); - *np = npgs; - *pnold = nold; - if (npgs != 0) { - jl_printf(JL_STDOUT, - "%4d : %7d/%7d objects (%3d%% old), %5d pages, %5d kB, %5d kB waste\n", - p->osize, - nused, - nused+nfree, - nused ? (nold*100)/nused : 0, - npgs, - (nused*p->osize)/1024, - *pwaste/1024); - } - return nused*p->osize; -} - -static void all_pool_stats(void) -{ - size_t nb=0, w, tw=0, no=0,tp=0, nold=0,noldbytes=0, np, nol; - for (int i = 0; i < JL_GC_N_POOLS; i++) { - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_tls_states_t *ptls = jl_all_tls_states[t_i]; - size_t b = pool_stats(&ptls->heap.norm_pools[i], &w, &np, &nol); - nb += b; - no += (b / ptls->heap.norm_pools[i].osize); - tw += w; - tp += np; - nold += nol; - noldbytes += nol * ptls->heap.norm_pools[i].osize; - } - } - jl_printf(JL_STDOUT, - "%d objects (%d%% old), %d kB (%d%% old) total allocated, %d total fragments (%d%% overhead), in %d pages\n", - no, (nold*100)/no, nb/1024, (noldbytes*100)/nb, tw, (tw*100)/nb, tp); -} - -static void big_obj_stats(void) -{ - bigval_t *v = current_heap->big_objects; - size_t nused=0, nbytes=0; - while (v != NULL) { - if (gc_marked(&v->_data)) { - nused++; - nbytes += v->sz&~3; - } - v = v->next; - } - v = big_objects_marked; - size_t nused_old=0, nbytes_old=0; - while (v != NULL) { - if (gc_marked(&v->_data)) { - nused_old++; - nbytes_old += v->sz&~3; - } - v = v->next; - } - - mallocarray_t *ma = current_heap->mallocarrays; - while (ma != NULL) { - if (gc_marked(jl_astaggedvalue(ma->a))) { - nused++; - nbytes += array_nbytes(ma->a); - } - ma = ma->next; - } - - jl_printf(JL_STDOUT, "%d kB (%d%% old) in %d large objects (%d%% old)\n", (nbytes + nbytes_old)/1024, nbytes + nbytes_old ? 
(nbytes_old*100)/(nbytes + nbytes_old) : 0, nused + nused_old, nused+nused_old ? (nused_old*100)/(nused + nused_old) : 0); -} -#endif //MEMPROFILE - JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { sz += JL_SMALL_BYTE_ALIGNMENT; diff --git a/src/gc.h b/src/gc.h index 1e975807fcddf..905b47e2899cb 100644 --- a/src/gc.h +++ b/src/gc.h @@ -56,7 +56,7 @@ typedef struct { } jl_alloc_num_t; typedef struct { - int sweep_mask; + int always_full; int wait_for_debugger; jl_alloc_num_t pool; jl_alloc_num_t other; @@ -199,10 +199,7 @@ extern bigval_t *big_objects_marked; extern arraylist_t finalizer_list; extern arraylist_t finalizer_list_marked; extern arraylist_t to_finalize; - -// Counters -// GC_FINAL_STATS only -extern size_t max_pg_count; +extern int64_t lazy_freed_pages; #define bigval_header(data) container_of((data), bigval_t, header) @@ -270,6 +267,66 @@ void jl_gc_free_page(void *p); // GC debug +#if defined(GC_TIME) || defined(GC_FINAL_STATS) +void gc_settime_premark_end(void); +void gc_settime_postmark_end(void); +#else +#define gc_settime_premark_end() +#define gc_settime_postmark_end() +#endif + +#ifdef GC_FINAL_STATS +void gc_final_count_page(size_t pg_cnt); +void gc_final_pause_end(int64_t t0, int64_t tend); +#else +#define gc_final_count_page(pg_cnt) +#define gc_final_pause_end(t0, tend) +#endif + +#ifdef GC_TIME +void gc_time_pool_start(void); +void gc_time_count_page(int freedall, int pg_skpd); +void gc_time_pool_end(int sweep_full); + +void gc_time_big_start(void); +void gc_time_count_big(int old_bits, int bits); +void gc_time_big_end(void); + +void gc_time_mallocd_array_start(void); +void gc_time_count_mallocd_array(int bits); +void gc_time_mallocd_array_end(void); + +void gc_time_mark_pause(int64_t t0, int64_t scanned_bytes, + int64_t perm_scanned_bytes); +void gc_time_sweep_pause(uint64_t gc_end_t, int64_t actual_allocd, + int64_t live_bytes, int64_t estimate_freed, + int sweep_full); +#else +#define gc_time_pool_start() +STATIC_INLINE void gc_time_count_page(int freedall, int pg_skpd) +{ + (void)freedall; + (void)pg_skpd; +} +#define gc_time_pool_end(sweep_full) +#define gc_time_big_start() +STATIC_INLINE void gc_time_count_big(int old_bits, int bits) +{ + (void)old_bits; + (void)bits; +} +#define gc_time_big_end() +#define gc_time_mallocd_array_start() +STATIC_INLINE void gc_time_count_mallocd_array(int bits) +{ + (void)bits; +} +#define gc_time_mallocd_array_end() +#define gc_time_mark_pause(t0, scanned_bytes, perm_scanned_bytes) +#define gc_time_sweep_pause(gc_end_t, actual_allocd, live_bytes, \ estimate_freed, sweep_full) +#endif + #ifdef GC_VERIFY extern jl_value_t *lostval; void gc_verify(void); @@ -312,13 +369,13 @@ extern int gc_verifying; #ifdef GC_DEBUG_ENV JL_DLLEXPORT extern jl_gc_debug_env_t jl_gc_debug_env; -#define gc_quick_sweep_mask jl_gc_debug_env.sweep_mask +#define gc_sweep_always_full jl_gc_debug_env.always_full int gc_debug_check_other(void); int gc_debug_check_pool(void); void gc_debug_print(void); void gc_scrub(char *stack_hi); #else -#define gc_quick_sweep_mask GC_MARKED_NOESC +#define gc_sweep_always_full 0 static inline int gc_debug_check_other(void) { return 0; @@ -354,6 +411,14 @@ static inline void objprofile_reset(void) } #endif +#ifdef MEMPROFILE +void gc_stats_all_pool(void); +void gc_stats_big_obj(void); +#else +#define gc_stats_all_pool() +#define gc_stats_big_obj() +#endif + // For debugging void gc_count_pool(void); diff --git a/src/julia_internal.h b/src/julia_internal.h index a5d859d9f1ef6..add229fc5ab0b 100644 --- 
a/src/julia_internal.h +++ b/src/julia_internal.h @@ -128,11 +128,7 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr) // parent isa jl_val void gc_debug_print_status(void); void gc_debug_critical_error(void); -#if defined(GC_FINAL_STATS) void jl_print_gc_stats(JL_STREAM *s); -#else -#define jl_print_gc_stats(s) ((void)s) -#endif int jl_assign_type_uid(void); jl_value_t *jl_cache_type_(jl_datatype_t *type); int jl_get_t_uid_ctr(void); diff --git a/src/options.h b/src/options.h index 53d477d76c104..f2e7b44e25ce2 100644 --- a/src/options.h +++ b/src/options.h @@ -56,8 +56,8 @@ // MEMPROFILE prints pool summary statistics after every GC //#define MEMPROFILE -// GCTIME prints time taken by each phase of GC -//#define GC_TIME +// GC_TIME prints time taken by each phase of GC +// #define GC_TIME // OBJPROFILE counts objects by type // #define OBJPROFILE
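
A note on the `gc.h` stubs above: when `GC_TIME` is off, every timing hook compiles away, either to an empty macro or to a `STATIC_INLINE` no-op, so the collector's hot paths carry no instrumentation cost in normal builds. A minimal standalone sketch of the same pattern; the `TRACE_TIME` flag and `trace_*` names are hypothetical stand-ins, and the timer assumes a POSIX `clock_gettime`:

```c
#include <stdio.h>
#include <time.h>

#ifdef TRACE_TIME
/* Real hooks, compiled only when tracing is requested (-DTRACE_TIME). */
static struct timespec trace_t0;
static void trace_start(void)
{
    clock_gettime(CLOCK_MONOTONIC, &trace_t0);
}
static void trace_end(const char *phase, long items)
{
    struct timespec t1;
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double ms = (t1.tv_sec - trace_t0.tv_sec) * 1e3 +
                (t1.tv_nsec - trace_t0.tv_nsec) / 1e6;
    printf("%s: %ld items in %.2f ms\n", phase, items, ms);
}
#else
/* No-op stubs: the macro vanishes entirely; the inline function keeps
 * its call sites type-checked, as with gc_time_count_page above. */
#define trace_start()
static inline void trace_end(const char *phase, long items)
{
    (void)phase;
    (void)items;
}
#endif

int main(void)
{
    trace_start();
    long live = 0;
    for (long i = 0; i < 1000000; i++)
        live += (i % 7) != 0;   /* stand-in for a sweep over objects */
    trace_end("sweep", live);
    return 0;
}
```

Keeping the argument-taking stubs as inline functions rather than empty macros means a build without the flag still rejects calls with the wrong argument types, which is presumably why `gc_time_count_big` and friends are inline stubs while the zero-argument hooks are plain macros.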
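The rewritten `sweep_weak_refs` loop in `gc.c` drops the `SWAP_wr` macro but keeps the same in-place compaction idiom: `n` scans past live entries, `ndel` counts dead ones, and each examined slot is swapped with the slot `ndel` positions ahead so dead entries drift to the tail and the list shrinks by `ndel` at the end. A self-contained sketch of that loop, with a hypothetical `is_live` predicate standing in for the `gc_marked` check:

```c
#include <stdio.h>

/* Stand-in for the mark-bit test; here zero means "dead". */
static int is_live(int x) { return x != 0; }

/* Compact `lst` in place, dropping dead entries; returns the new length.
 * Mirrors sweep_weak_refs: examine lst[n]; advance n past a live item or
 * bump ndel for a dead one, then swap the current slot with the one ndel
 * places ahead until only parked dead items remain past position n. */
static size_t compact(int *lst, size_t len)
{
    if (len == 0)
        return 0;
    size_t n = 0, ndel = 0;
    while (1) {
        if (is_live(lst[n]))
            n++;
        else
            ndel++;
        if (n >= len - ndel)
            break;
        int tmp = lst[n];
        lst[n] = lst[n + ndel];
        lst[n + ndel] = tmp;
    }
    return len - ndel;
}

int main(void)
{
    int lst[] = {3, 0, 7, 0, 0, 5};
    size_t n = compact(lst, sizeof(lst) / sizeof(lst[0]));
    for (size_t i = 0; i < n; i++)
        printf("%d ", lst[i]);  /* prints: 3 7 5 */
    printf("\n");
    return 0;
}
```

The pruning needs no scratch allocation, which matters since it runs inside a GC pause.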
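One more note on the format strings: the new timing code consistently uses the `<inttypes.h>` macros (`PRId64`, `PRIuPTR`) because `%ld` only matches `int64_t` on LP64 platforms; on 64-bit Windows, `long` is 32 bits, so passing an `int64_t` for `%ld` is undefined behavior. A minimal sketch of the portable idiom:

```c
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    int64_t freed = 9876543210;  /* does not fit in a 32-bit long */
    /* PRId64 expands to the right conversion spec for int64_t on the
     * current platform ("ld" on LP64 Unix, "lld" or "I64d" elsewhere). */
    printf("freed %" PRId64 " kB\n", freed / 1024);
    return 0;
}
```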