Skip to content

Commit

Permalink
Refs #221 #246. Fixed the stack overflow bug in multithreaded BLAS3.
Browse files Browse the repository at this point in the history
When NUM_THREADS (MAX_CPU_NUMBER) is very large, e.g. 256:

typedef struct {
  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

job_t          job[MAX_CPU_NUMBER];

The job array then takes up 8 MB of stack space.

Thus, we use malloc instead of stack allocation.
  • Loading branch information
xianyi committed Jul 7, 2013
1 parent 886cbaf commit 5d33121
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 9 deletions.
25 changes: 24 additions & 1 deletion driver/level3/level3_gemm3m_thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > 210
#define USE_ALLOC_HEAP
#endif

#ifndef GEMM3M_LOCAL
#if defined(NN)
#define GEMM3M_LOCAL GEMM3M_NN
Expand Down Expand Up @@ -836,7 +842,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
BLASLONG range_M[MAX_CPU_NUMBER + 1];
BLASLONG range_N[MAX_CPU_NUMBER + 1];

job_t job[MAX_CPU_NUMBER];
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif

BLASLONG num_cpu_m, num_cpu_n;

Expand Down Expand Up @@ -866,6 +876,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job;

if (!range_m) {
Expand Down Expand Up @@ -945,6 +964,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
exec_blas(num_cpu_m, queue);
}

#ifdef USE_ALLOC_HEAP
free(job);
#endif

return 0;
}

Expand Down
23 changes: 23 additions & 0 deletions driver/level3/level3_syrk_threaded.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > 210
#define USE_ALLOC_HEAP
#endif

#ifndef SYRK_LOCAL
#if !defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_UN
Expand Down Expand Up @@ -502,7 +508,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO

blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif

blas_queue_t queue[MAX_CPU_NUMBER];

BLASLONG range[MAX_CPU_NUMBER + 100];
Expand Down Expand Up @@ -556,6 +567,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
newarg.ldc = args -> ldc;
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job;

if (!range_n) {
Expand Down Expand Up @@ -668,6 +688,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
exec_blas(num_cpu, queue);
}

#ifdef USE_ALLOC_HEAP
free(job);
#endif

return 0;
}
24 changes: 24 additions & 0 deletions driver/level3/level3_thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > 210
#define USE_ALLOC_HEAP
#endif

#ifndef GEMM_LOCAL
#if defined(NN)
#define GEMM_LOCAL GEMM_NN
Expand Down Expand Up @@ -531,7 +537,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG

blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif

blas_queue_t queue[MAX_CPU_NUMBER];

BLASLONG range_M[MAX_CPU_NUMBER + 1];
Expand Down Expand Up @@ -575,6 +586,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job;

#ifdef PARAMTEST
Expand Down Expand Up @@ -660,6 +680,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
exec_blas(num_cpu_m, queue);
}

#ifdef USE_ALLOC_HEAP
free(job);
#endif

return 0;
}

Expand Down
9 changes: 2 additions & 7 deletions driver/others/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ int get_num_procs(void) {
}
return nums;
}

/*
void set_stack_limit(int limitMB){
int result=0;
struct rlimit rl;
Expand All @@ -235,6 +235,7 @@ void set_stack_limit(int limitMB){
}
}
}
*/
#endif

/*
Expand Down Expand Up @@ -1273,12 +1274,6 @@ void CONSTRUCTOR gotoblas_init(void) {
#endif

#ifdef DYNAMIC_ARCH
#if defined(SMP) && defined(OS_DARWIN) && MAX_CPU_NUMBER > 128
//Set stack limit to 16MB on Mac OS X
//when NUM_THREADS>128 and DYNAMIC_ARCH=1.
//Prevent the SEGFAULT bug.
set_stack_limit(16);
#endif
gotoblas_dynamic_init();
#endif

Expand Down
25 changes: 24 additions & 1 deletion lapack/getrf/getrf_parallel.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ static FLOAT dm1 = -1.;

double sqrt(double);

//In this case, the recursive getrf_parallel may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > 90
#define USE_ALLOC_HEAP
#endif

#ifndef CACHE_LINE_SIZE
#define CACHE_LINE_SIZE 8
#endif
Expand Down Expand Up @@ -356,7 +362,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
BLASLONG range_M[MAX_CPU_NUMBER + 1];
BLASLONG range_N[MAX_CPU_NUMBER + 1];

#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job=NULL;
#endif

BLASLONG width, nn, mm;
BLASLONG i, j, k, is, bk;
Expand Down Expand Up @@ -401,7 +411,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,

newarg.c = ipiv;
newarg.lda = lda;
newarg.common = (void *)job;

info = 0;

Expand All @@ -427,6 +436,16 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,

if (iinfo && !info) info = iinfo;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job;

TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);

sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
Expand Down Expand Up @@ -586,6 +605,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
is += bk;
}

#ifdef USE_ALLOC_HEAP
free(job);
#endif

return info;
}

Expand Down
25 changes: 25 additions & 0 deletions lapack/potrf/potrf_parallel.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@

#ifndef USE_SIMPLE_THREADED_LEVEL3

//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > 210
#define USE_ALLOC_HEAP
#endif


static FLOAT dm1 = -1.;

#ifndef KERNEL_FUNC
Expand Down Expand Up @@ -342,7 +349,12 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){

blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif

blas_queue_t queue[MAX_CPU_NUMBER];

BLASLONG range[MAX_CPU_NUMBER + 100];
Expand Down Expand Up @@ -387,6 +399,15 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
newarg.c = args -> c;
newarg.lda = args -> lda;
newarg.alpha = args -> alpha;

#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif

newarg.common = (void *)job;

n_from = 0;
Expand Down Expand Up @@ -494,6 +515,10 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
exec_blas(num_cpu, queue);
}

#ifdef USE_ALLOC_HEAP
free(job);
#endif

return 0;
}

Expand Down

1 comment on commit 5d33121

@ViralBShah
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not always allocate on the heap?

Please sign in to comment.