From 87f8cca3e89617d26579d7d1f2ee72abdab95eb3 Mon Sep 17 00:00:00 2001 From: M Clark Date: Tue, 12 May 2015 14:43:46 -0700 Subject: [PATCH 1/7] Fixed assertion failure for GDR receive buffer with Nface > 1 for Wilson fermions. --- lib/color_spinor_field.cpp | 40 +++++++++++++------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index 97a09cf54b..382d393ec3 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -52,32 +52,20 @@ namespace quda { if (getVerbosity() == QUDA_DEBUG_VERBOSE) printfQuda("Precision = %d, Subset = %d\n", precision, siteSubset); - int num_faces = 1; - int num_norm_faces=2; - - // FIXME - this is a hack from hell that needs to be fixed. When - // the TIFR interface is enabled we are forcing naive staggered - // support which breaks asqtad/hisq fermions. The problem occurs - // because the ghost zone is allocated before we know which - // operator (and hence number of faces are needed). One solution - // may be to separate the ghost zone memory allocation from the - // field itself, which has other benefits (1. on multi-gpu - // machines with UVA, we can read the ghost zone directly from the - // neighbouring field and 2.) we can use a single contiguous - // buffer for the ghost zone and its norm which will reduce - // latency for half precision and allow us to enable GPU_COMMS - // support for half precision). -#ifdef BUILD_TIFR_INTERFACE - if (nSpin == 1) { //staggered - num_faces=2; - num_norm_faces=2; - } -#else - if (nSpin == 1) { // improved staggered - num_faces=6; - num_norm_faces=6; - } -#endif + // FIXME - The ghost zone is allocated before we know which + // operator (and hence number of faces are needed), thus we + // allocate a ghost zone large enough to cope with the maximum + // number of faces (maxNface). This can artificially raise the + // GPU memory requirements. 
One potential future solution may be + // to separate the ghost zone memory allocation from the field + // itself, which has other benefits (1. on multi-gpu machines with + // UVA, we can read the ghost zone directly from the neighbouring + // field and 2.) we can use a single contiguous buffer for the + // ghost zone and its norm which will reduce latency for half + // precision and allow us to enable GPU_COMMS support for half + // precision). + int num_faces = ((nSpin == 1) ? 2 : 1) * maxNface; + int num_norm_faces = 2*maxNface; // calculate size of ghost zone required int ghostVolume = 0; From 705adab3bda994327bbe495a2f2b3ab72bbf0cf2 Mon Sep 17 00:00:00 2001 From: M Clark Date: Tue, 12 May 2015 15:00:57 -0700 Subject: [PATCH 2/7] Small clean up of message handle creation in cuda_color_field.cpp and added debugging of communicator declaration in comm_common.cpp. --- lib/comm_common.cpp | 20 ++++++++++++++++++++ lib/cuda_color_spinor_field.cu | 9 +++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index e1ff605351..ab4de2ef68 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -228,6 +228,16 @@ int comm_coord(int dim) */ MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes) { +#ifdef HOST_DEBUG + cudaPointerAttributes attributes; + cudaPointerGetAttributes(&attributes, buffer); + if (attributes->memoryType == cudaMemoryTypeHost) { + memset(buffer, 0, nbytes); + } else { + assert(cudaSuccess == cudaMemset(buffer, 0, nbytes)); + } +#endif + int disp[QUDA_MAX_DIM] = {0}; disp[dim] = dir; @@ -240,6 +250,16 @@ MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nby */ MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes) { +#ifdef HOST_DEBUG + cudaPointerAttributes attributes; + cudaPointerGetAttributes(&attributes, buffer); + if (attributes->memoryType == cudaMemoryTypeHost) { + memset(buffer, 0, nbytes); + 
} else { + assert(cudaSuccess == cudaMemset(buffer, 0, nbytes)); + } +#endif + int disp[QUDA_MAX_DIM] = {0}; disp[dim] = dir; diff --git a/lib/cuda_color_spinor_field.cu b/lib/cuda_color_spinor_field.cu index 5611178541..adae047cf7 100644 --- a/lib/cuda_color_spinor_field.cu +++ b/lib/cuda_color_spinor_field.cu @@ -945,6 +945,7 @@ namespace quda { if (!commDimPartitioned(i)) continue; #ifdef GPU_COMMS size_t nbytes_Nface = surfaceCB[i]*Ndof*precision*(j+1); + size_t nbytes_Nface_norm = surfaceCB[i]*(j+1)*sizeof(float); if (i != 3 || getKernelPackT() || getTwistPack()) { #else size_t nbytes_Nface = (nbytes[i] / maxNface) * (j+1); @@ -959,8 +960,8 @@ namespace quda { if(precision == QUDA_HALF_PRECISION){ for(int b=0; b<2; ++b){ - mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, surfaceCB[i]*(j+1)*sizeof(float)); - mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(my_back_norm_face[b][i], i, -1, surfaceCB[i]*(j+1)*sizeof(float)); + mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm); + mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(my_back_norm_face[b][i], i, -1, nbytes_Nface_norm); mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i]; mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i]; } @@ -1041,8 +1042,8 @@ namespace quda { if(precision == QUDA_HALF_PRECISION){ for(int b=0; b<2; ++b){ - mh_recv_norm_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, surfaceCB[i]*sizeof(float)*(j+1)); - mh_recv_norm_back[b][j][i] = comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, surfaceCB[i]*sizeof(float)*(j+1)); + mh_recv_norm_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm); + mh_recv_norm_back[b][j][i] = comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, nbytes_Nface_norm); } } #endif // GPU_COMMS From 
b03757e30fc35333da5a3fa69e996a5de5493b5b Mon Sep 17 00:00:00 2001 From: M Clark Date: Tue, 12 May 2015 21:43:59 -0700 Subject: [PATCH 3/7] Error checking for comms send buffers must be non-destructive. Added CPU comms buffer checking using std::fill and std::copy. --- include/comm_quda.h | 20 +++++++++++++---- lib/comm_common.cpp | 52 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 57 insertions(+), 15 deletions(-) diff --git a/include/comm_quda.h b/include/comm_quda.h index 5deffc865a..09cce216ef 100644 --- a/include/comm_quda.h +++ b/include/comm_quda.h @@ -29,22 +29,34 @@ extern "C" { int comm_coord(int dim); /** - Create a persistent message handler for a relative send + Create a persistent message handler for a relative send. This + should not be called directly, and instead the helper macro + (without the trailing underscore) should be called instead. @param buffer Buffer from which message will be sent @param dim Dimension in which message will be sent @param dir Direction in which messaged with be sent (0 - backwards, 1 forwards) @param nbytes Size of message in bytes */ - MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes); + MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t nbytes); + +#define comm_declare_send_relative(buffer, dim, dir, nbytes) \ + comm_declare_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes) /** - Create a persistent message handler for a relative receive + Create a persistent message handler for a relative receive. This + should not be called directly, and instead the helper macro + (without the trailing underscore) should be called instead. 
@param buffer Buffer into which message will be received @param dim Dimension from message will be received @param dir Direction from messaged with be recived (0 - backwards, 1 forwards) @param nbytes Size of message in bytes */ - MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes); + MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t nbytes); + +#define comm_declare_receive_relative(buffer, dim, dir, nbytes) \ + comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes) /** Create a persistent strided message handler for a relative send diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index ab4de2ef68..b2a9454b6f 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -1,4 +1,5 @@ #include // for gethostname() +#include #include #include @@ -226,15 +227,32 @@ int comm_coord(int dim) /** * Send to the "dir" direction in the "dim" dimension */ -MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes) +MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t nbytes) { #ifdef HOST_DEBUG cudaPointerAttributes attributes; - cudaPointerGetAttributes(&attributes, buffer); - if (attributes->memoryType == cudaMemoryTypeHost) { - memset(buffer, 0, nbytes); + cudaError_t err = cudaPointerGetAttributes(&attributes, buffer); + if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) { + // test this memory allocation is ok by doing a memcpy from it + void *tmp = safe_malloc(nbytes); + try { + std::copy(static_cast(buffer), static_cast(buffer)+nbytes, static_cast(tmp)); + } catch(std::exception &e) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); + errorQuda("Aborting"); + } + if (err != cudaSuccess) cudaGetLastError(); + host_free(tmp); } else { 
- assert(cudaSuccess == cudaMemset(buffer, 0, nbytes)); + // test this memory allocation is ok by doing a memcpy from it + void *tmp = device_malloc(nbytes); + cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice); + if (err != cudaSuccess) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); + errorQuda("Aborting with error %s", cudaGetErrorString(err)); + } + device_free(tmp); } #endif @@ -244,19 +262,31 @@ MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nby return comm_declare_send_displaced(buffer, disp, nbytes); } - /** * Receive from the "dir" direction in the "dim" dimension */ -MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes) +MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t nbytes) { #ifdef HOST_DEBUG cudaPointerAttributes attributes; - cudaPointerGetAttributes(&attributes, buffer); - if (attributes->memoryType == cudaMemoryTypeHost) { - memset(buffer, 0, nbytes); + cudaError_t err = cudaPointerGetAttributes(&attributes, buffer); + if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) { + // test this memory allocation is ok by filling it + try { + std::fill(static_cast(buffer), static_cast(buffer)+nbytes, 0); + } catch(std::exception &e) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); + errorQuda("Aborting"); + } + if (err != cudaSuccess) cudaGetLastError(); } else { - assert(cudaSuccess == cudaMemset(buffer, 0, nbytes)); + // test this memory allocation is ok by doing a memset + cudaError_t err = cudaMemset(buffer, 0, nbytes); + if (err != cudaSuccess) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); + errorQuda("Aborting with error %s", 
cudaGetErrorString(err)); + } } #endif From 3b56ea3f982afbdf1a6cff668573fb0bc26d209c Mon Sep 17 00:00:00 2001 From: M Clark Date: Wed, 13 May 2015 10:36:21 -0700 Subject: [PATCH 4/7] In cudaColorSpinorField::createComms, only allocate the send message handlers for the requested number of faces. --- lib/cuda_color_spinor_field.cu | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/lib/cuda_color_spinor_field.cu b/lib/cuda_color_spinor_field.cu index adae047cf7..89ffe8d2b4 100644 --- a/lib/cuda_color_spinor_field.cu +++ b/lib/cuda_color_spinor_field.cu @@ -552,7 +552,6 @@ namespace quda { // only allocate if not already allocated or buffer required is bigger than previously if(initGhostFaceBuffer == 0 || faceBytes > ghostFaceBytes){ - if (initGhostFaceBuffer){ for(int b=0; b<2; ++b) device_free(ghostFaceBuffer[b]); } @@ -951,8 +950,8 @@ namespace quda { size_t nbytes_Nface = (nbytes[i] / maxNface) * (j+1); #endif for(int b=0; b<2; ++b){ - mh_send_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_face[b][i], i, +1, nbytes_Nface); - mh_send_back[b][j][2*i+0] = comm_declare_send_relative(my_back_face[b][i], i, -1, nbytes_Nface); + mh_send_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_fwd_face[b][i], i, +1, nbytes_Nface) : NULL; + mh_send_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_back_face[b][i], i, -1, nbytes_Nface) : NULL; mh_send_fwd[b][j][2*i+1] = mh_send_fwd[b][j][2*i]; // alias pointers mh_send_back[b][j][2*i+1] = mh_send_back[b][j][2*i]; // alias pointers } @@ -960,8 +959,8 @@ namespace quda { if(precision == QUDA_HALF_PRECISION){ for(int b=0; b<2; ++b){ - mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm); - mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(my_back_norm_face[b][i], i, -1, nbytes_Nface_norm); + mh_send_norm_fwd[b][j][2*i+0] = (j+1 == nFace) ? 
comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm) : NULL; + mh_send_norm_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_back_norm_face[b][i], i, -1, nbytes_Nface_norm) : NULL; mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i]; mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i]; } @@ -1013,11 +1012,12 @@ namespace quda { //printf("%d strided sends with Nface=%d Nblocks=%d blksize=%d Stride=%d\n", i, j+1, Nblocks, blksize, Stride); for(int b=0; b<2; ++b){ - mh_send_fwd[b][j][2*i+0] = comm_declare_strided_send_relative(base[2], i, +1, blksize, Nblocks, Stride); - mh_send_back[b][j][2*i+0] = comm_declare_strided_send_relative(base[0], i, -1, blksize, Nblocks, Stride); + // only allocate a communicator for the present face (this needs cleaned up) + mh_send_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[2], i, +1, blksize, Nblocks, Stride) : NULL; + mh_send_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[0], i, -1, blksize, Nblocks, Stride) : NULL; if (nSpin ==4) { // dagger communicators - mh_send_fwd[b][j][2*i+1] = comm_declare_strided_send_relative(base[3], i, +1, blksize, Nblocks, Stride); - mh_send_back[b][j][2*i+1] = comm_declare_strided_send_relative(base[1], i, -1, blksize, Nblocks, Stride); + mh_send_fwd[b][j][2*i+1] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[3], i, +1, blksize, Nblocks, Stride) : NULL; + mh_send_back[b][j][2*i+1] = (j+1 == nFace) ? 
comm_declare_strided_send_relative(base[1], i, -1, blksize, Nblocks, Stride) : NULL; } else { mh_send_fwd[b][j][2*i+1] = mh_send_fwd[b][j][2*i+0]; mh_send_back[b][j][2*i+1] = mh_send_back[b][j][2*i+0]; @@ -1031,8 +1031,8 @@ namespace quda { void *norm_fwd = static_cast(norm) + Nt_minus1_offset; void *norm_back = norm; // the first time slice has zero offset for(int b=0; b<2; ++b){ - mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(norm_fwd, i, +1, surfaceCB[i]*(j+1)*sizeof(float)); - mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(norm_back, i, -1, surfaceCB[i]*(j+1)*sizeof(float)); + mh_send_norm_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(norm_fwd, i, +1, surfaceCB[i]*(j+1)*sizeof(float)) : NULL; + mh_send_norm_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(norm_back, i, -1, surfaceCB[i]*(j+1)*sizeof(float)) : NULL; mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i]; mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i]; } @@ -1073,20 +1073,20 @@ namespace quda { if (commDimPartitioned(i)) { comm_free(mh_recv_fwd[b][j][i]); comm_free(mh_recv_back[b][j][i]); - comm_free(mh_send_fwd[b][j][2*i]); - comm_free(mh_send_back[b][j][2*i]); + if (mh_send_fwd[b][j][2*i]) comm_free(mh_send_fwd[b][j][2*i]); + if (mh_send_back[b][j][2*i]) comm_free(mh_send_back[b][j][2*i]); // only in a special case are these not aliasing pointers #ifdef GPU_COMMS if(precision == QUDA_HALF_PRECISION){ comm_free(mh_recv_norm_fwd[b][j][i]); comm_free(mh_recv_norm_back[b][j][i]); - comm_free(mh_send_norm_fwd[b][j][2*i]); - comm_free(mh_send_norm_back[b][j][2*i]); + if (mh_send_norm_fwd[b][j][2*i]) comm_free(mh_send_norm_fwd[b][j][2*i]); + if (mh_send_norm_back[b][j][2*i]) comm_free(mh_send_norm_back[b][j][2*i]); } if (i == 3 && !getKernelPackT() && nSpin == 4) { - comm_free(mh_send_fwd[b][j][2*i+1]); - comm_free(mh_send_back[b][j][2*i+1]); + if (mh_send_fwd[b][j][2*i+1]) comm_free(mh_send_fwd[b][j][2*i+1]); + if 
(mh_send_back[b][j][2*i+1]) comm_free(mh_send_back[b][j][2*i+1]); } #endif // GPU_COMMS } From 36292b4e8b5ea665d25556104a85aea9ce4e733a Mon Sep 17 00:00:00 2001 From: M Clark Date: Wed, 13 May 2015 12:05:38 -0700 Subject: [PATCH 5/7] Added buffer validity checking for strided message handlers in lib/comm_common.cpp. --- include/comm_quda.h | 22 ++++++++++---- lib/comm_common.cpp | 70 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 79 insertions(+), 13 deletions(-) diff --git a/include/comm_quda.h b/include/comm_quda.h index 09cce216ef..06bbac2578 100644 --- a/include/comm_quda.h +++ b/include/comm_quda.h @@ -59,7 +59,9 @@ extern "C" { comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes) /** - Create a persistent strided message handler for a relative send + Create a persistent strided message handler for a relative send. + This should not be called directly, and instead the helper macro + (without the trailing underscore) should be called instead. 
@param buffer Buffer from which message will be sent @param dim Dimension in which message will be sent @param dir Direction in which messaged with be sent (0 - backwards, 1 forwards) @@ -67,11 +69,17 @@ extern "C" { @param nblocks Number of blocks @param stride Stride between blocks in bytes */ - MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir, - size_t blksize, int nblocks, size_t stride); + MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, + size_t blksize, int nblocks, size_t stride); + +#define comm_declare_strided_send_relative(buffer, dim, dir, blksize, nblocks, stride) \ + comm_declare_strided_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride) /** Create a persistent strided message handler for a relative receive + This should not be called directly, and instead the helper macro + (without the trailing underscore) should be called instead. 
@param buffer Buffer into which message will be received @param dim Dimension from message will be received @param dir Direction from messaged with be recived (0 - backwards, 1 forwards) @@ -79,8 +87,12 @@ extern "C" { @param nblocks Number of blocks @param stride Stride between blocks in bytes */ - MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir, - size_t blksize, int nblocks, size_t stride); + MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, + size_t blksize, int nblocks, size_t stride); + +#define comm_declare_strided_receive_relative(buffer, dim, dir, blksize, nblocks, stride) \ + comm_declare_strided_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride) void comm_finalize(void); void comm_dim_partitioned_set(int dim); diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index b2a9454b6f..9285a07209 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -240,7 +240,7 @@ MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int l std::copy(static_cast(buffer), static_cast(buffer)+nbytes, static_cast(tmp)); } catch(std::exception &e) { printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); - errorQuda("Aborting"); + errorQuda("aborting"); } if (err != cudaSuccess) cudaGetLastError(); host_free(tmp); @@ -250,7 +250,7 @@ MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int l cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice); if (err != cudaSuccess) { printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); - errorQuda("Aborting with error %s", cudaGetErrorString(err)); + errorQuda("aborting with error %s", cudaGetErrorString(err)); } device_free(tmp); } @@ -277,7 +277,7 @@ MsgHandle 
*comm_declare_receive_relative_(const char *func, const char *file, in std::fill(static_cast(buffer), static_cast(buffer)+nbytes, 0); } catch(std::exception &e) { printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); - errorQuda("Aborting"); + errorQuda("aborting"); } if (err != cudaSuccess) cudaGetLastError(); } else { @@ -285,7 +285,7 @@ MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, in cudaError_t err = cudaMemset(buffer, 0, nbytes); if (err != cudaSuccess) { printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); - errorQuda("Aborting with error %s", cudaGetErrorString(err)); + errorQuda("aborting with error %s", cudaGetErrorString(err)); } } #endif @@ -299,9 +299,38 @@ MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, in /** * Strided send to the "dir" direction in the "dim" dimension */ -MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir, - size_t blksize, int nblocks, size_t stride) +MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride) { +#ifdef HOST_DEBUG + cudaPointerAttributes attributes; + cudaError_t err = cudaPointerGetAttributes(&attributes, buffer); + if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) { + // test this memory allocation is ok by doing a memcpy from it + void *tmp = safe_malloc(blksize*nblocks); + try { + for (int i=0; i(buffer)+i*stride, static_cast(buffer)+i*stride+blksize, static_cast(tmp)); + } catch(std::exception &e) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n", + file, line, func, dim, dir, blksize, nblocks, stride); + errorQuda("aborting"); + } + host_free(tmp); + if (err != cudaSuccess) cudaGetLastError(); + } else { + // 
test this memory allocation is ok by doing a memcpy from it + void *tmp = device_malloc(blksize*nblocks); + cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice); + if (err != cudaSuccess) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n", + file, line, func, dim, dir, blksize, nblocks, stride); + errorQuda("aborting with error %s", cudaGetErrorString(err)); + } + device_free(tmp); + } +#endif + int disp[QUDA_MAX_DIM] = {0}; disp[dim] = dir; @@ -312,9 +341,34 @@ MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir, /** * Strided receive from the "dir" direction in the "dim" dimension */ -MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir, - size_t blksize, int nblocks, size_t stride) +MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride) { +#ifdef HOST_DEBUG + cudaPointerAttributes attributes; + cudaError_t err = cudaPointerGetAttributes(&attributes, buffer); + if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) { + // test this memory allocation is ok by filling it + try { + for (int i=0; i(buffer)+i*stride, static_cast(buffer)+i*stride+blksize, 0); + } catch(std::exception &e) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n", + file, line, func, dim, dir, blksize, nblocks, stride); + errorQuda("aborting"); + } + if (err != cudaSuccess) cudaGetLastError(); + } else { + // test this memory allocation is ok by doing a memset + cudaError_t err = cudaMemset2D(buffer, stride, 0, blksize, nblocks); + if (err != cudaSuccess) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n", + file, line, func, dim, dir, blksize, nblocks, stride); + errorQuda("aborting with error 
%s", cudaGetErrorString(err)); + } } +#endif + int disp[QUDA_MAX_DIM] = {0}; disp[dim] = dir; From 634d17075bb4f13b397bad0209cf686df791ccdd Mon Sep 17 00:00:00 2001 From: M Clark Date: Wed, 13 May 2015 12:33:45 -0700 Subject: [PATCH 6/7] Only allocate receive message handlers for the requested number of faces. --- lib/cuda_color_spinor_field.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/cuda_color_spinor_field.cu b/lib/cuda_color_spinor_field.cu index 89ffe8d2b4..3e1bdb1581 100644 --- a/lib/cuda_color_spinor_field.cu +++ b/lib/cuda_color_spinor_field.cu @@ -1042,15 +1042,15 @@ namespace quda { if(precision == QUDA_HALF_PRECISION){ for(int b=0; b<2; ++b){ - mh_recv_norm_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm); - mh_recv_norm_back[b][j][i] = comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, nbytes_Nface_norm); + mh_recv_norm_fwd[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm) : NULL; + mh_recv_norm_back[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, nbytes_Nface_norm) : NULL; } } #endif // GPU_COMMS for(int b=0; b<2; ++b){ - mh_recv_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_face[b][i], i, +1, nbytes_Nface); - mh_recv_back[b][j][i] = comm_declare_receive_relative(from_back_face[b][i], i, -1, nbytes_Nface); + mh_recv_fwd[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_fwd_face[b][i], i, +1, nbytes_Nface) : NULL; + mh_recv_back[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_back_face[b][i], i, -1, nbytes_Nface) : NULL; } @@ -1071,15 +1071,15 @@ namespace quda { for (int j=0; j Date: Wed, 13 May 2015 12:35:19 -0700 Subject: [PATCH 7/7] Always allocate a ghost zone of one for Wilson-like and three for staggered fermions. 
--- lib/color_spinor_field.cpp | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index 382d393ec3..6cee431084 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -55,17 +55,24 @@ namespace quda { // FIXME - The ghost zone is allocated before we know which // operator (and hence number of faces are needed), thus we // allocate a ghost zone large enough to cope with the maximum - number of faces (maxNface). This can artificially raise the - GPU memory requirements. One potential future solution may be - to separate the ghost zone memory allocation from the field - itself, which has other benefits (1. on multi-gpu machines with - UVA, we can read the ghost zone directly from the neighbouring - field and 2.) we can use a single contiguous buffer for the - ghost zone and its norm which will reduce latency for half - precision and allow us to enable GPU_COMMS support for half - precision). - int num_faces = ((nSpin == 1) ? 2 : 1) * maxNface; - int num_norm_faces = 2*maxNface; + number of faces. All Wilson-like operators only + involve the exchange of one face so this is no problem. + However, for staggered fermions, we have either nFace=1 or 3, + thus we allocate using the latter. This will artificially + raise the GPU memory requirements for naive staggered fermions. + One potential future solution may be to separate the ghost zone + memory allocation from the field itself, which has other + benefits (1. on multi-gpu machines with UVA, we can read the + ghost zone directly from the neighbouring field and 2.) we can + use a single contiguous buffer for the ghost zone and its norm + which will reduce latency for half precision and allow us to + enable GPU_COMMS support for half precision). + int nFaceGhost = (nSpin == 1) ? 
3 : 1; + + // For Wilson we halve the number of effective faces since the + // fields are spin projected. + int num_faces = ((nSpin == 1) ? 2 : 1) * nFaceGhost; + int num_norm_faces = 2*nFaceGhost; // calculate size of ghost zone required int ghostVolume = 0;