From 5d3312142a2926bad34924971208673818947120 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 8 Jul 2013 01:07:05 +0800 Subject: [PATCH] Refs #221 #246. Fixed the stack overflow bug in multithreading BLAS3. When NUM_THREADS (MAX_CPU_NUMBER) is very large, e.g. 256, typedef struct { volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; job_t job[MAX_CPU_NUMBER]; The job array takes 8MB. Thus, we use malloc instead of stack allocation. --- driver/level3/level3_gemm3m_thread.c | 25 ++++++++++++++++++++++++- driver/level3/level3_syrk_threaded.c | 23 +++++++++++++++++++++++ driver/level3/level3_thread.c | 24 ++++++++++++++++++++++++ driver/others/memory.c | 9 ++------- lapack/getrf/getrf_parallel.c | 25 ++++++++++++++++++++++++- lapack/potrf/potrf_parallel.c | 25 +++++++++++++++++++++++++ 6 files changed, 122 insertions(+), 9 deletions(-) diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index bddb5eb878..036eebb04b 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -48,6 +48,12 @@ #define SWITCH_RATIO 2 #endif +//The array of job_t may overflow the stack. +//Instead, use malloc to alloc job_t. 
+#if MAX_CPU_NUMBER > 210 +#define USE_ALLOC_HEAP +#endif + #ifndef GEMM3M_LOCAL #if defined(NN) #define GEMM3M_LOCAL GEMM3M_NN @@ -836,7 +842,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG BLASLONG range_M[MAX_CPU_NUMBER + 1]; BLASLONG range_N[MAX_CPU_NUMBER + 1]; - job_t job[MAX_CPU_NUMBER]; +#ifndef USE_ALLOC_HEAP + job_t job[MAX_CPU_NUMBER]; +#else + job_t * job = NULL; +#endif BLASLONG num_cpu_m, num_cpu_n; @@ -866,6 +876,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG newarg.alpha = args -> alpha; newarg.beta = args -> beta; newarg.nthreads = args -> nthreads; + +#ifdef USE_ALLOC_HEAP + job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); + if(job==NULL){ + fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); + exit(1); + } +#endif + newarg.common = (void *)job; if (!range_m) { @@ -945,6 +964,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG exec_blas(num_cpu_m, queue); } +#ifdef USE_ALLOC_HEAP + free(job); +#endif + return 0; } diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 9d1f4d2a04..989d156e49 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -48,6 +48,12 @@ #define SWITCH_RATIO 2 #endif +//The array of job_t may overflow the stack. +//Instead, use malloc to alloc job_t. 
+#if MAX_CPU_NUMBER > 210 +#define USE_ALLOC_HEAP +#endif + #ifndef SYRK_LOCAL #if !defined(LOWER) && !defined(TRANS) #define SYRK_LOCAL SYRK_UN @@ -502,7 +508,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO blas_arg_t newarg; +#ifndef USE_ALLOC_HEAP job_t job[MAX_CPU_NUMBER]; +#else + job_t * job = NULL; +#endif + blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 100]; @@ -556,6 +567,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO newarg.ldc = args -> ldc; newarg.alpha = args -> alpha; newarg.beta = args -> beta; + +#ifdef USE_ALLOC_HEAP + job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); + if(job==NULL){ + fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); + exit(1); + } +#endif + newarg.common = (void *)job; if (!range_n) { @@ -668,6 +688,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO exec_blas(num_cpu, queue); } +#ifdef USE_ALLOC_HEAP + free(job); +#endif return 0; } diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 87a32898c9..9cf297ed44 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -48,6 +48,12 @@ #define SWITCH_RATIO 2 #endif +//The array of job_t may overflow the stack. +//Instead, use malloc to alloc job_t. 
+#if MAX_CPU_NUMBER > 210 +#define USE_ALLOC_HEAP +#endif + #ifndef GEMM_LOCAL #if defined(NN) #define GEMM_LOCAL GEMM_NN @@ -531,7 +537,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG blas_arg_t newarg; +#ifndef USE_ALLOC_HEAP job_t job[MAX_CPU_NUMBER]; +#else + job_t * job = NULL; +#endif + blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_M[MAX_CPU_NUMBER + 1]; @@ -575,6 +586,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG newarg.alpha = args -> alpha; newarg.beta = args -> beta; newarg.nthreads = args -> nthreads; + +#ifdef USE_ALLOC_HEAP + job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); + if(job==NULL){ + fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); + exit(1); + } +#endif + newarg.common = (void *)job; #ifdef PARAMTEST @@ -660,6 +680,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG exec_blas(num_cpu_m, queue); } +#ifdef USE_ALLOC_HEAP + free(job); +#endif + return 0; } diff --git a/driver/others/memory.c b/driver/others/memory.c index 21ee93ddad..d8046d7bdd 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -217,7 +217,7 @@ int get_num_procs(void) { } return nums; } - +/* void set_stack_limit(int limitMB){ int result=0; struct rlimit rl; @@ -235,6 +235,7 @@ void set_stack_limit(int limitMB){ } } } +*/ #endif /* @@ -1273,12 +1274,6 @@ void CONSTRUCTOR gotoblas_init(void) { #endif #ifdef DYNAMIC_ARCH -#if defined(SMP) && defined(OS_DARWIN) && MAX_CPU_NUMBER > 128 - //Set stack limit to 16MB on Mac OS X - //when NUM_THREADS>128 and DYNAMIC_ARCH=1. - //Prevent the SEGFAULT bug. 
- set_stack_limit(16); -#endif gotoblas_dynamic_init(); #endif diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 6f6672099d..e9ce038b5a 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -43,6 +43,12 @@ static FLOAT dm1 = -1.; double sqrt(double); +//In this case, the recursive getrf_parallel may overflow the stack. +//Instead, use malloc to alloc job_t. +#if MAX_CPU_NUMBER > 90 +#define USE_ALLOC_HEAP +#endif + #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif @@ -356,7 +362,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG range_M[MAX_CPU_NUMBER + 1]; BLASLONG range_N[MAX_CPU_NUMBER + 1]; +#ifndef USE_ALLOC_HEAP job_t job[MAX_CPU_NUMBER]; +#else + job_t * job=NULL; +#endif BLASLONG width, nn, mm; BLASLONG i, j, k, is, bk; @@ -401,7 +411,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = ipiv; newarg.lda = lda; - newarg.common = (void *)job; info = 0; @@ -427,6 +436,16 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (iinfo && !info) info = iinfo; +#ifdef USE_ALLOC_HEAP + job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); + if(job==NULL){ + fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); + exit(1); + } +#endif + + newarg.common = (void *)job; + TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); @@ -586,6 +605,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, is += bk; } +#ifdef USE_ALLOC_HEAP + free(job); +#endif + return info; } diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index f270c3d9e2..4156dc04cb 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -41,6 +41,13 @@ #ifndef USE_SIMPLE_THREADED_LEVEL3 +//The array of job_t may overflow the stack. 
+//Instead, use malloc to alloc job_t. +#if MAX_CPU_NUMBER > 210 +#define USE_ALLOC_HEAP +#endif + + static FLOAT dm1 = -1.; #ifndef KERNEL_FUNC @@ -342,7 +349,12 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ blas_arg_t newarg; +#ifndef USE_ALLOC_HEAP job_t job[MAX_CPU_NUMBER]; +#else + job_t * job = NULL; +#endif + blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 100]; @@ -387,6 +399,15 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ newarg.c = args -> c; newarg.lda = args -> lda; newarg.alpha = args -> alpha; + +#ifdef USE_ALLOC_HEAP + job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); + if(job==NULL){ + fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); + exit(1); + } +#endif + newarg.common = (void *)job; n_from = 0; @@ -494,6 +515,10 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ exec_blas(num_cpu, queue); } +#ifdef USE_ALLOC_HEAP + free(job); +#endif + return 0; }