From f772c9a18db2abdf9df4b048be874e31488fc6ae Mon Sep 17 00:00:00 2001
From: Jarrett Revels
Date: Thu, 18 Feb 2016 10:38:01 -0500
Subject: [PATCH] request 64-byte alignment instead of 16-byte alignment for
 large objects

This should align the requested memory with cache lines, keeping wide
loads and stores from straddling cache-line boundaries.
---
 src/array.c          | 15 +++++-----
 src/gc.c             | 66 +++++++++++++++++++++++---------------------
 src/julia_internal.h |  3 ++
 3 files changed, 44 insertions(+), 40 deletions(-)

diff --git a/src/array.c b/src/array.c
index 32067535644fb..b5c5743b1b4f4 100644
--- a/src/array.c
+++ b/src/array.c
@@ -18,7 +18,6 @@ extern "C" {
 
 #define JL_ARRAY_ALIGN(jl_value, nbytes) LLT_ALIGN(jl_value, nbytes)
-
 
 // array constructors ---------------------------------------------------------
 
 static inline int store_unboxed(jl_value_t *el_type)
@@ -74,13 +73,13 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims,
     }
 
     int ndimwords = jl_array_ndimwords(ndims);
-    int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), 16);
+    int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), JL_CACHE_BYTE_ALIGNMENT);
     if (tot <= ARRAY_INLINE_NBYTES) {
         if (isunboxed && elsz >= 4)
-            tsz = JL_ARRAY_ALIGN(tsz, 16); // align data area 16
+            tsz = JL_ARRAY_ALIGN(tsz, JL_SMALL_BYTE_ALIGNMENT); // align data area
         size_t doffs = tsz;
         tsz += tot;
-        tsz = JL_ARRAY_ALIGN(tsz, 16); // align whole object 16
+        tsz = JL_ARRAY_ALIGN(tsz, JL_SMALL_BYTE_ALIGNMENT); // align whole object
         a = (jl_array_t*)jl_gc_allocobj(tsz);
         jl_set_typeof(a, atype);
         a->flags.how = 0;
@@ -90,7 +89,7 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims,
         }
     }
     else {
-        tsz = JL_ARRAY_ALIGN(tsz, 16); // align whole object 16
+        tsz = JL_ARRAY_ALIGN(tsz, JL_CACHE_BYTE_ALIGNMENT); // align whole object
         a = (jl_array_t*)jl_gc_allocobj(tsz);
         JL_GC_PUSH1(&a);
         jl_set_typeof(a, atype);
@@ -157,7 +156,7 @@ JL_DLLEXPORT jl_array_t *jl_reshape_array(jl_value_t *atype, jl_array_t *data,
 
     size_t ndims = jl_nfields(dims);
     int ndimwords = jl_array_ndimwords(ndims);
-    int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t) + sizeof(void*), 16);
+    int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t) + sizeof(void*), JL_SMALL_BYTE_ALIGNMENT);
     a = (jl_array_t*)jl_gc_allocobj(tsz);
     jl_set_typeof(a, atype);
     a->flags.pooled = tsz <= GC_MAX_SZCLASS;
@@ -233,7 +232,7 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array_1d(jl_value_t *atype, void *data,
         elsz = sizeof(void*);
 
     int ndimwords = jl_array_ndimwords(1);
-    int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), 16);
+    int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), JL_CACHE_BYTE_ALIGNMENT);
     a = (jl_array_t*)jl_gc_allocobj(tsz);
     jl_set_typeof(a, atype);
     a->flags.pooled = tsz <= GC_MAX_SZCLASS;
@@ -284,7 +283,7 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array(jl_value_t *atype, void *data,
         elsz = sizeof(void*);
 
     int ndimwords = jl_array_ndimwords(ndims);
-    int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), 16);
+    int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), JL_CACHE_BYTE_ALIGNMENT);
     a = (jl_array_t*)jl_gc_allocobj(tsz);
     jl_set_typeof(a, atype);
     a->flags.pooled = tsz <= GC_MAX_SZCLASS;
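A note on the arithmetic above: JL_ARRAY_ALIGN is a thin wrapper over
LLT_ALIGN, which rounds a byte count up to the next multiple of a
power-of-two alignment. The sketch below shows that round-up trick in
isolation; ALIGN_UP is a stand-in name, not Julia's actual macro.

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Round x up to the next multiple of a (a must be a power of two). */
    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

    int main(void)
    {
        size_t hdr = 40; /* e.g. a 40-byte array header */
        printf("16-byte aligned: %zu\n", ALIGN_UP(hdr, 16)); /* 48 */
        printf("64-byte aligned: %zu\n", ALIGN_UP(hdr, 64)); /* 64 */
        assert(ALIGN_UP(hdr, 64) % 64 == 0);
        return 0;
    }

Rounding the header size up to JL_CACHE_BYTE_ALIGNMENT is what lets the
array data that follows the header start on a cache-line boundary.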
diff --git a/src/gc.c b/src/gc.c
index 0c617dc3e7a6f..2ba37c5026940 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -135,6 +135,13 @@ typedef struct _bigval_t {
         size_t sz;
         uintptr_t age : 2;
     };
+#ifdef _P64 // Add padding so that char data[] below is 64-byte aligned
+    // (8 pointers of 8 bytes each) - (4 other pointers in struct)
+    void *_padding[8 - 4];
+#else
+    // (16 pointers of 4 bytes each) - (4 other pointers in struct)
+    void *_padding[16 - 4];
+#endif
     //struct buff_t <>;
     union {
         uintptr_t header;
@@ -146,7 +153,7 @@ typedef struct _bigval_t {
 #if !defined(_COMPILER_MICROSOFT_)
     int _dummy[0];
 #endif
-    // must be 16-aligned here, in 32 & 64b
+    // must be 64-byte aligned here, in 32 & 64 bit modes
     char data[];
 } bigval_t;
 
@@ -171,7 +178,7 @@ typedef struct _pool_t {
 
 #define GC_PAGE_LG2 14 // log2(size of a page)
 #define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k
-#define GC_PAGE_OFFSET (16 - (sizeof_jl_taggedvalue_t % 16))
+#define GC_PAGE_OFFSET (JL_SMALL_BYTE_ALIGNMENT - (sizeof_jl_taggedvalue_t % JL_SMALL_BYTE_ALIGNMENT))
 
 // pool page metadata
 typedef struct _gcpage_t {
@@ -437,15 +444,8 @@ static int jl_gc_finalizers_inhibited; // don't run finalizers during codegen #1
 
 // malloc wrappers, aligned allocation
 
-#if defined(_P64) || defined(__APPLE__)
-#define malloc_a16(sz) malloc(sz)
-#define realloc_a16(p, sz, oldsz) realloc((p), (sz))
-#define free_a16(p) free(p)
-#else
-#define malloc_a16(sz) jl_malloc_aligned(sz, 16)
-#define realloc_a16(p, sz, oldsz) jl_realloc_aligned(p, sz, oldsz, 16)
-#define free_a16(p) jl_free_aligned(p)
-#endif
+#define malloc_cache_align(sz) jl_malloc_aligned(sz, JL_CACHE_BYTE_ALIGNMENT)
+#define realloc_cache_align(p, sz, oldsz) jl_realloc_aligned(p, sz, oldsz, JL_CACHE_BYTE_ALIGNMENT)
 
 static void schedule_finalization(void *o, void *f)
 {
@@ -1011,10 +1011,10 @@ static NOINLINE void *alloc_big(size_t sz)
 {
     maybe_collect();
     size_t offs = offsetof(bigval_t, header);
-    size_t allocsz = LLT_ALIGN(sz + offs, 16);
+    size_t allocsz = LLT_ALIGN(sz + offs, JL_CACHE_BYTE_ALIGNMENT);
     if (allocsz < sz) // overflow in adding offs, size was "negative"
         jl_throw(jl_memory_exception);
-    bigval_t *v = (bigval_t*)malloc_a16(allocsz);
+    bigval_t *v = (bigval_t*)malloc_cache_align(allocsz);
     if (v == NULL)
         jl_throw(jl_memory_exception);
     jl_atomic_fetch_add(&allocd_bytes, allocsz);
@@ -1074,7 +1074,7 @@ static bigval_t **sweep_big_list(int sweep_mask, bigval_t **pv)
 #ifdef MEMDEBUG
             memset(v, 0xbb, v->sz&~3);
 #endif
-            free_a16(v);
+            jl_free_aligned(v);
             big_freed++;
         }
         big_total++;
@@ -1141,7 +1141,7 @@ static void jl_gc_free_array(jl_array_t *a)
     if (a->flags.how == 2) {
         char *d = (char*)a->data - a->offset*a->elsize;
         if (a->flags.isaligned)
-            free_a16(d);
+            jl_free_aligned(d);
         else
             free(d);
         freed_bytes += array_nbytes(a);
@@ -2500,7 +2500,7 @@ void *reallocb(void *b, size_t sz)
     if (allocsz < sz) // overflow in adding offs, size was "negative"
         jl_throw(jl_memory_exception);
     bigval_t *bv = bigval_header(buff);
-    bv = (bigval_t*)realloc_a16(bv, allocsz, bv->sz&~3);
+    bv = (bigval_t*)realloc_cache_align(bv, allocsz, bv->sz&~3);
     if (bv == NULL)
         jl_throw(jl_memory_exception);
     return &bv->data[0];
@@ -2539,7 +2539,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_0w(void)
 
 JL_DLLEXPORT jl_value_t *jl_gc_alloc_1w(void)
 {
-    const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*), 16);
+    const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*), JL_SMALL_BYTE_ALIGNMENT);
     void *tag = NULL;
 #ifdef MEMDEBUG
     tag = alloc_big(sz);
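The padding added to bigval_t above is pure layout arithmetic: the
struct has four pointer-sized bookkeeping fields (next, prev, the
sz/age union, and the header union), and the padding tops the region
before data[] up to exactly one 64-byte cache line, so data[] starts
cache-line aligned whenever the allocation itself does. Below is a
simplified sketch of the same layout; the field names and the _LP64
test are stand-ins for the real struct, and C11 is assumed for
static_assert.

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef struct big {
        struct big *next;
        struct big **prev;
        size_t sz;
    #ifdef _LP64
        void *_padding[8 - 4];  /* (8 pointers * 8 bytes) - (4 fields in use) */
    #else
        void *_padding[16 - 4]; /* (16 pointers * 4 bytes) - (4 fields in use) */
    #endif
        uintptr_t header;
        char data[];
    } big_t;

    /* data[] sits exactly one cache line past the struct start, so a
       64-byte-aligned allocation leaves data[] 64-byte aligned too. */
    static_assert(offsetof(big_t, data) == 64, "data[] not at a 64-byte offset");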
@@ -2552,7 +2552,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_1w(void)
 
 JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void)
 {
-    const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 2, 16);
+    const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 2, JL_SMALL_BYTE_ALIGNMENT);
     void *tag = NULL;
 #ifdef MEMDEBUG
     tag = alloc_big(sz);
@@ -2565,7 +2565,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void)
 
 JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void)
 {
-    const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 3, 16);
+    const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 3, JL_SMALL_BYTE_ALIGNMENT);
     void *tag = NULL;
 #ifdef MEMDEBUG
     tag = alloc_big(sz);
@@ -2612,7 +2612,7 @@ jl_thread_heap_t *jl_mk_thread_heap(void)
 #ifdef JULIA_ENABLE_THREADING
     // Cache-aligned malloc
     jl_thread_heap =
-        (jl_thread_heap_t*)jl_malloc_aligned(sizeof(jl_thread_heap_t), 64);
+        (jl_thread_heap_t*)jl_malloc_aligned(sizeof(jl_thread_heap_t), JL_CACHE_BYTE_ALIGNMENT);
 #endif
     FOR_CURRENT_HEAP () {
         const int *szc = sizeclasses;
@@ -2781,6 +2781,7 @@ static void big_obj_stats(void)
 
 JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 {
+    sz += JL_SMALL_BYTE_ALIGNMENT;
     maybe_collect();
     allocd_bytes += sz;
     gc_num.malloc++;
@@ -2792,6 +2793,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 
 JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 {
+    nm += JL_SMALL_BYTE_ALIGNMENT;
     maybe_collect();
     allocd_bytes += nm*sz;
     gc_num.malloc++;
@@ -2804,15 +2806,15 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 JL_DLLEXPORT void jl_gc_counted_free(void *p, size_t sz)
 {
     free(p);
-    freed_bytes += sz;
+    freed_bytes += sz + JL_SMALL_BYTE_ALIGNMENT;
     gc_num.freecall++;
 }
 
-JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old,
-                                                       size_t sz)
+JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz)
 {
+    old += JL_SMALL_BYTE_ALIGNMENT;
+    sz += JL_SMALL_BYTE_ALIGNMENT;
     maybe_collect();
-
     if (sz < old)
         freed_bytes += (old - sz);
     else
@@ -2826,7 +2828,7 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old,
 
 JL_DLLEXPORT void *jl_malloc(size_t sz)
 {
-    int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + 16);
+    int64_t *p = (int64_t *)jl_gc_counted_malloc(sz);
     p[0] = sz;
     return (void *)(p + 2);
 }
@@ -2835,7 +2837,7 @@ JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
 {
     int64_t *p;
     size_t nmsz = nm*sz;
-    p = (int64_t *)jl_gc_counted_calloc(nmsz + 16, 1);
+    p = (int64_t *)jl_gc_counted_calloc(nmsz, 1);
     p[0] = nmsz;
     return (void *)(p + 2);
 }
@@ -2844,14 +2846,14 @@ JL_DLLEXPORT void jl_free(void *p)
 {
     int64_t *pp = (int64_t *)p - 2;
     size_t sz = pp[0];
-    jl_gc_counted_free(pp, sz + 16);
+    jl_gc_counted_free(pp, sz);
 }
 
 JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
 {
     int64_t *pp = (int64_t *)p - 2;
     size_t szold = pp[0];
-    int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold + 16, sz + 16);
+    int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz);
     pnew[0] = sz;
     return (void *)(pnew + 2);
 }
@@ -2859,12 +2861,12 @@ JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
 JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
 {
     maybe_collect();
-    size_t allocsz = LLT_ALIGN(sz, 16);
+    size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
     if (allocsz < sz) // overflow in adding offs, size was "negative"
         jl_throw(jl_memory_exception);
     allocd_bytes += allocsz;
     gc_num.malloc++;
-    void *b = malloc_a16(allocsz);
+    void *b = malloc_cache_align(allocsz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
     return b;
@@ -2875,7 +2877,7 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
 {
     maybe_collect();
 
-    size_t allocsz = LLT_ALIGN(sz, 16);
+    size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
     if (allocsz < sz) // overflow in adding offs, size was "negative"
         jl_throw(jl_memory_exception);
 
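The jl_malloc/jl_free changes above all revolve around a 16-byte size
prefix: the payload size goes into the first int64_t slot, the caller
gets back p + 2 (16 bytes past the start), and this patch moves the
accounting for those extra JL_SMALL_BYTE_ALIGNMENT bytes into the
jl_gc_counted_* functions instead of repeating "+ 16" at every call
site. A stripped-down sketch of the prefix scheme, with the GC
accounting and exception paths omitted and purely illustrative names:

    #include <stdint.h>
    #include <stdlib.h>

    #define SMALL_ALIGN 16 /* stand-in for JL_SMALL_BYTE_ALIGNMENT */

    static void *prefixed_malloc(size_t sz)
    {
        /* Reserve one 16-byte slot in front of the payload. */
        int64_t *p = (int64_t *)malloc(sz + SMALL_ALIGN);
        if (p == NULL)
            return NULL;
        p[0] = (int64_t)sz;     /* record the payload size */
        return (void *)(p + 2); /* skip the header: 2 * 8 == 16 bytes */
    }

    static void prefixed_free(void *p)
    {
        if (p != NULL)
            free((int64_t *)p - 2); /* step back over the header */
    }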
@@ -2891,7 +2893,7 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
 
     void *b;
     if (isaligned)
-        b = realloc_a16(d, allocsz, oldsz);
+        b = realloc_cache_align(d, allocsz, oldsz);
     else
         b = realloc(d, allocsz);
     if (b == NULL)
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 118fefc1958ce..84ebb6e86f0a9 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -499,6 +499,9 @@ STATIC_INLINE void jl_free_aligned(void *p)
 }
 #endif
 
+#define JL_SMALL_BYTE_ALIGNMENT 16
+#define JL_CACHE_BYTE_ALIGNMENT 64
+
 #ifdef __cplusplus
 }
 #endif
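With JL_CACHE_BYTE_ALIGNMENT defined, the property the patch is after
is easy to spot-check: a 64-byte-aligned allocation returns an address
whose low six bits are zero, i.e. one that starts a cache line. A small
standalone check follows; posix_memalign stands in here for Julia's
jl_malloc_aligned, which wraps the platform-specific equivalent.

    #define _POSIX_C_SOURCE 200112L
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        void *p = NULL;
        if (posix_memalign(&p, 64, 1024) != 0)
            return 1;
        /* 64-byte alignment means the low 6 address bits are clear. */
        printf("addr %% 64 == %u\n", (unsigned)((uintptr_t)p % 64));
        free(p);
        return 0;
    }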