diff --git a/include/comm_quda.h b/include/comm_quda.h
index 5deffc865a..06bbac2578 100644
--- a/include/comm_quda.h
+++ b/include/comm_quda.h
@@ -29,25 +29,39 @@ extern "C" {
   int comm_coord(int dim);

   /**
-     Create a persistent message handler for a relative send
+     Create a persistent message handler for a relative send.  This
+     should not be called directly; use the helper macro (without
+     the trailing underscore) instead.
      @param buffer Buffer from which message will be sent
      @param dim Dimension in which message will be sent
      @param dir Direction in which message will be sent (0 - backwards, 1 - forwards)
      @param nbytes Size of message in bytes
   */
-  MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes);
+  MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
+                                         void *buffer, int dim, int dir, size_t nbytes);
+
+#define comm_declare_send_relative(buffer, dim, dir, nbytes) \
+  comm_declare_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

   /**
-     Create a persistent message handler for a relative receive
+     Create a persistent message handler for a relative receive.  This
+     should not be called directly; use the helper macro (without
+     the trailing underscore) instead.
      @param buffer Buffer into which message will be received
      @param dim Dimension from which message will be received
      @param dir Direction from which message will be received (0 - backwards, 1 - forwards)
      @param nbytes Size of message in bytes
   */
-  MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes);
+  MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
+                                            void *buffer, int dim, int dir, size_t nbytes);
+
+#define comm_declare_receive_relative(buffer, dim, dir, nbytes) \
+  comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

   /**
-     Create a persistent strided message handler for a relative send
+     Create a persistent strided message handler for a relative send.
+     This should not be called directly; use the helper macro
+     (without the trailing underscore) instead.
      @param buffer Buffer from which message will be sent
      @param dim Dimension in which message will be sent
      @param dir Direction in which message will be sent (0 - backwards, 1 - forwards)
@@ -55,11 +69,17 @@ extern "C" {
      @param nblocks Number of blocks
      @param stride Stride between blocks in bytes
   */
-  MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
-                                                size_t blksize, int nblocks, size_t stride);
+  MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
+                                                 void *buffer, int dim, int dir,
+                                                 size_t blksize, int nblocks, size_t stride);
+
+#define comm_declare_strided_send_relative(buffer, dim, dir, blksize, nblocks, stride) \
+  comm_declare_strided_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

   /**
      Create a persistent strided message handler for a relative receive
+     This should not be called directly; use the helper macro (without
+     the trailing underscore) instead.
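The declarations above all follow one idiom: the real entry point gains a trailing underscore plus three leading parameters, and a function-like macro with the old name forwards `__func__`, `__FILE__`, and `__LINE__`, so every existing call site transparently reports where it was invoked from. A minimal self-contained sketch of the same idiom (the `my_alloc` name is illustrative, not part of QUDA):

```c
#include <stdio.h>
#include <stdlib.h>

/* Real entry point: receives the call-site information explicitly. */
static void *my_alloc_(const char *func, const char *file, int line, size_t nbytes)
{
  void *ptr = malloc(nbytes);
  if (!ptr) {
    fprintf(stderr, "malloc of %zu bytes failed (%s:%d in %s())\n",
            nbytes, file, line, func);
    exit(EXIT_FAILURE);
  }
  return ptr;
}

/* Helper macro (no trailing underscore): captures the caller's location.
   Call sites keep the old spelling, e.g. buf = my_alloc(1024); */
#define my_alloc(nbytes) my_alloc_(__func__, __FILE__, __LINE__, nbytes)
```

Because the macro keeps the old name and argument list, no call site elsewhere in the code base needs to change.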
      @param buffer Buffer into which message will be received
      @param dim Dimension from which message will be received
      @param dir Direction from which message will be received (0 - backwards, 1 - forwards)
@@ -67,8 +87,12 @@ extern "C" {
      @param nblocks Number of blocks
      @param stride Stride between blocks in bytes
   */
-  MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
-                                                   size_t blksize, int nblocks, size_t stride);
+  MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
+                                                    void *buffer, int dim, int dir,
+                                                    size_t blksize, int nblocks, size_t stride);
+
+#define comm_declare_strided_receive_relative(buffer, dim, dir, blksize, nblocks, stride) \
+  comm_declare_strided_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

   void comm_finalize(void);
   void comm_dim_partitioned_set(int dim);
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 97a09cf54b..6cee431084 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -52,32 +52,27 @@ namespace quda {
     if (getVerbosity() == QUDA_DEBUG_VERBOSE)
       printfQuda("Precision = %d, Subset = %d\n", precision, siteSubset);

-    int num_faces = 1;
-    int num_norm_faces=2;
-
-    // FIXME - this is a hack from hell that needs to be fixed.  When
-    // the TIFR interface is enabled we are forcing naive staggered
-    // support which breaks asqtad/hisq fermions.  The problem occurs
-    // because the ghost zone is allocated before we know which
-    // operator (and hence number of faces are needed).  One solution
-    // may be to separate the ghost zone memory allocation from the
-    // field itself, which has other benefits (1. on multi-gpu
-    // machines with UVA, we can read the ghost zone directly from the
-    // neighbouring field and 2.) we can use a single contiguous
-    // buffer for the ghost zone and its norm which will reduce
-    // latency for half precision and allow us to enable GPU_COMMS
-    // support for half precision).
-#ifdef BUILD_TIFR_INTERFACE
-    if (nSpin == 1) { //staggered
-      num_faces=2;
-      num_norm_faces=2;
-    }
-#else
-    if (nSpin == 1) { // improved staggered
-      num_faces=6;
-      num_norm_faces=6;
-    }
-#endif
+    // FIXME - The ghost zone is allocated before we know which
+    // operator (and hence how many faces) will be needed, so we
+    // allocate a ghost zone large enough to cope with the maximum
+    // number of faces.  All Wilson-like operators only involve the
+    // exchange of one face, so this is not a problem.  However, for
+    // staggered fermions we have either nFace=1 or nFace=3, so we
+    // allocate using the latter.  This artificially raises the GPU
+    // memory requirements for naive staggered fermions.  One potential
+    // future solution is to separate the ghost zone memory allocation
+    // from the field itself, which has other benefits: (1) on
+    // multi-gpu machines with UVA, we can read the ghost zone directly
+    // from the neighbouring field, and (2) we can use a single
+    // contiguous buffer for the ghost zone and its norm, which will
+    // reduce latency for half precision and allow us to enable
+    // GPU_COMMS support for half precision.
+    int nFaceGhost = (nSpin == 1) ? 3 : 1;
+
+    // For Wilson-like fields this gives the number of effective faces,
+    // which is halved relative to staggered since the fields are spin
+    // projected before communication.
+    int num_faces = ((nSpin == 1) ? 2 : 1) * nFaceGhost;
+    int num_norm_faces = 2*nFaceGhost;

     // calculate size of ghost zone required
     int ghostVolume = 0;
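For concreteness, a worked check of the face counts produced by the expressions above (values only; `check_face_counts` is an illustrative helper, not QUDA API):

```c
#include <assert.h>

/* Worked example of the ghost-zone face counts for the two cases. */
static void check_face_counts(int nSpin)
{
  int nFaceGhost     = (nSpin == 1) ? 3 : 1;
  int num_faces      = ((nSpin == 1) ? 2 : 1) * nFaceGhost;
  int num_norm_faces = 2 * nFaceGhost;

  if (nSpin == 1) { /* staggered: always sized for improved, nFace = 3 */
    assert(num_faces == 6 && num_norm_faces == 6);
  } else {          /* Wilson-like: a single spin-projected face */
    assert(num_faces == 1 && num_norm_faces == 2);
  }
}
```

These match the values the deleted branch hard-coded for Wilson (1 and 2) and for improved staggered (6 and 6); naive staggered now simply inherits the larger footprint.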
diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp
index e1ff605351..9285a07209 100644
--- a/lib/comm_common.cpp
+++ b/lib/comm_common.cpp
@@ -1,4 +1,5 @@
 #include <unistd.h> // for gethostname()
+#include <algorithm>
 #include
 #include

@@ -226,20 +227,69 @@ int comm_coord(int dim)
 /**
  * Send to the "dir" direction in the "dim" dimension
  */
-MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes)
+MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
+                                       void *buffer, int dim, int dir, size_t nbytes)
 {
+#ifdef HOST_DEBUG
+  cudaPointerAttributes attributes;
+  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
+  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
+    // test this memory allocation is ok by doing a memcpy from it
+    void *tmp = safe_malloc(nbytes);
+    try {
+      std::copy(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, static_cast<char*>(tmp));
+    } catch(std::exception &e) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
+      errorQuda("aborting");
+    }
+    if (err != cudaSuccess) cudaGetLastError();
+    host_free(tmp);
+  } else {
+    // test this memory allocation is ok by doing a memcpy from it
+    void *tmp = device_malloc(nbytes);
+    cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice);
+    if (err != cudaSuccess) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
+      errorQuda("aborting with error %s", cudaGetErrorString(err));
+    }
+    device_free(tmp);
+  }
+#endif
+
   int disp[QUDA_MAX_DIM] = {0};
   disp[dim] = dir;

   return comm_declare_send_displaced(buffer, disp, nbytes);
 }

-
 /**
  * Receive from the "dir" direction in the "dim" dimension
  */
-MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes)
+MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
+                                          void *buffer, int dim, int dir, size_t nbytes)
 {
+#ifdef HOST_DEBUG
+  cudaPointerAttributes attributes;
+  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
+  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
+    // test this memory allocation is ok by filling it
+    try {
+      std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, 0);
+    } catch(std::exception &e) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
+      errorQuda("aborting");
+    }
+    if (err != cudaSuccess) cudaGetLastError();
+  } else {
+    // test this memory allocation is ok by doing a memset
+    cudaError_t err = cudaMemset(buffer, 0, nbytes);
+    if (err != cudaSuccess) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
+      errorQuda("aborting with error %s", cudaGetErrorString(err));
+    }
+  }
+#endif
+
   int disp[QUDA_MAX_DIM] = {0};
   disp[dim] = dir;
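Both HOST_DEBUG guards above follow the same recipe: classify the pointer with `cudaPointerGetAttributes`, then touch every byte the later communication will touch — copying out of a send buffer (non-destructive), filling a receive buffer — so that an undersized or stale allocation faults here, with file/line context, rather than deep inside the communication layer. A condensed stand-alone sketch of the receive-side check, with the QUDA wrappers (`printfQuda`, `errorQuda`, `safe_malloc`) replaced by plain equivalents:

```cpp
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Probe that [buffer, buffer+nbytes) is fully addressable before a receive
// is posted into it, so a bad extent fails fast with call-site context.
static void check_recv_buffer(void *buffer, size_t nbytes, const char *file, int line)
{
  cudaPointerAttributes attr;
  cudaError_t err = cudaPointerGetAttributes(&attr, buffer);

  if (err != cudaSuccess) {
    // Unregistered host memory: clear the sticky CUDA error, then walk
    // the full extent from the host.
    cudaGetLastError();
    std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer) + nbytes, 0);
  } else {
    // Device-accessible memory: probe the extent with a device-side memset.
    err = cudaMemset(buffer, 0, nbytes);
    if (err != cudaSuccess) {
      fprintf(stderr, "bad receive buffer of %zu bytes (%s:%d): %s\n",
              nbytes, file, line, cudaGetErrorString(err));
      exit(EXIT_FAILURE);
    }
  }
}
```

Zero-filling is safe on the receive side because the contents are about to be overwritten by the incoming message; the send-side check instead copies out of the buffer into a scratch allocation, leaving the payload untouched.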
@@ -249,9 +299,38 @@ MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t
 /**
  * Strided send to the "dir" direction in the "dim" dimension
  */
-MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
-                                              size_t blksize, int nblocks, size_t stride)
+MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
+                                               void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
 {
+#ifdef HOST_DEBUG
+  cudaPointerAttributes attributes;
+  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
+  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
+    // test this memory allocation is ok by doing a memcpy from it
+    void *tmp = safe_malloc(blksize*nblocks);
+    try {
+      for (int i=0; i<nblocks; i++)
+        std::copy(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, static_cast<char*>(tmp));
+    } catch(std::exception &e) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
+                 file, line, func, dim, dir, blksize, nblocks, stride);
+      errorQuda("aborting");
+    }
+    host_free(tmp);
+    if (err != cudaSuccess) cudaGetLastError();
+  } else {
+    // test this memory allocation is ok by doing a memcpy from it
+    void *tmp = device_malloc(blksize*nblocks);
+    cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice);
+    if (err != cudaSuccess) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
+                 file, line, func, dim, dir, blksize, nblocks, stride);
+      errorQuda("aborting with error %s", cudaGetErrorString(err));
+    }
+    device_free(tmp);
+  }
+#endif
+
   int disp[QUDA_MAX_DIM] = {0};
   disp[dim] = dir;

@@ -262,9 +341,34 @@ MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
 /**
  * Strided receive from the "dir" direction in the "dim" dimension
  */
-MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
-                                                 size_t blksize, int nblocks, size_t stride)
+MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
+                                                  void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
 {
+#ifdef HOST_DEBUG
+  cudaPointerAttributes attributes;
+  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
+  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
+    // test this memory allocation is ok by filling it
+    try {
+      for (int i=0; i<nblocks; i++)
+        std::fill(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, 0);
+    } catch(std::exception &e) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
+                 file, line, func, dim, dir, blksize, nblocks, stride);
+      errorQuda("aborting");
+    }
+    if (err != cudaSuccess) cudaGetLastError();
+  } else {
+    // test this memory allocation is ok by doing a memset
+    cudaError_t err = cudaMemset2D(buffer, stride, 0, blksize, nblocks);
+    if (err != cudaSuccess) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
+                 file, line, func, dim, dir, blksize, nblocks, stride);
+      errorQuda("aborting with error %s", cudaGetErrorString(err));
+    }
+  }
+#endif
+
   int disp[QUDA_MAX_DIM] = {0};
   disp[dim] = dir;
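The strided variants above apply the same probes per block: nblocks blocks of blksize bytes spaced stride bytes apart. On the device side a single `cudaMemcpy2D` (or `cudaMemset2D` for receives) covers the whole pattern in one call. A minimal sketch of the device-side send check (illustrative, not the QUDA wrappers):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Probe a strided device send buffer.  cudaMemcpy2D reads exactly the bytes
// a strided send would read, so an undersized allocation is caught up front.
static bool check_strided_send(const void *buffer, size_t blksize, int nblocks, size_t stride)
{
  void *tmp = NULL;
  if (cudaMalloc(&tmp, blksize * nblocks) != cudaSuccess) return false;

  // dpitch = blksize packs the blocks contiguously into tmp, while
  // spitch = stride walks the source at the communication stride.
  cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride,
                                 blksize, nblocks, cudaMemcpyDeviceToDevice);
  if (err != cudaSuccess)
    fprintf(stderr, "strided probe failed: %s\n", cudaGetErrorString(err));

  cudaFree(tmp);
  return err == cudaSuccess;
}
```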
diff --git a/lib/cuda_color_spinor_field.cu b/lib/cuda_color_spinor_field.cu
index 5611178541..3e1bdb1581 100644
--- a/lib/cuda_color_spinor_field.cu
+++ b/lib/cuda_color_spinor_field.cu
@@ -552,7 +552,6 @@ namespace quda {
     // only allocate if not already allocated or buffer required is bigger than previously
     if(initGhostFaceBuffer == 0 || faceBytes > ghostFaceBytes){
-
       if (initGhostFaceBuffer){
        for(int b=0; b<2; ++b) device_free(ghostFaceBuffer[b]);
       }

@@ -945,13 +944,14 @@ namespace quda {
       if (!commDimPartitioned(i)) continue;

 #ifdef GPU_COMMS
       size_t nbytes_Nface = surfaceCB[i]*Ndof*precision*(j+1);
+      size_t nbytes_Nface_norm = surfaceCB[i]*(j+1)*sizeof(float);
       if (i != 3 || getKernelPackT() || getTwistPack()) {
 #else
       size_t nbytes_Nface = (nbytes[i] / maxNface) * (j+1);
 #endif

       for(int b=0; b<2; ++b){
-        mh_send_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_face[b][i], i, +1, nbytes_Nface);
-        mh_send_back[b][j][2*i+0] = comm_declare_send_relative(my_back_face[b][i], i, -1, nbytes_Nface);
+        mh_send_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_fwd_face[b][i], i, +1, nbytes_Nface) : NULL;
+        mh_send_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_back_face[b][i], i, -1, nbytes_Nface) : NULL;
         mh_send_fwd[b][j][2*i+1] = mh_send_fwd[b][j][2*i];   // alias pointers
         mh_send_back[b][j][2*i+1] = mh_send_back[b][j][2*i]; // alias pointers
       }

@@ -959,8 +959,8 @@ namespace quda {
       if(precision == QUDA_HALF_PRECISION){
        for(int b=0; b<2; ++b){
-         mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, surfaceCB[i]*(j+1)*sizeof(float));
-         mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(my_back_norm_face[b][i], i, -1, surfaceCB[i]*(j+1)*sizeof(float));
+         mh_send_norm_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm) : NULL;
+         mh_send_norm_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_back_norm_face[b][i], i, -1, nbytes_Nface_norm) : NULL;
          mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i];
          mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i];
        }

@@ -1012,11 +1012,12 @@ namespace quda {
          //printf("%d strided sends with Nface=%d Nblocks=%d blksize=%d Stride=%d\n", i, j+1, Nblocks, blksize, Stride);
          for(int b=0; b<2; ++b){
-           mh_send_fwd[b][j][2*i+0] = comm_declare_strided_send_relative(base[2], i, +1, blksize, Nblocks, Stride);
-           mh_send_back[b][j][2*i+0] = comm_declare_strided_send_relative(base[0], i, -1, blksize, Nblocks, Stride);
+           // only allocate a communicator for the present face (this needs cleaning up)
+           mh_send_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[2], i, +1, blksize, Nblocks, Stride) : NULL;
+           mh_send_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[0], i, -1, blksize, Nblocks, Stride) : NULL;
            if (nSpin == 4) { // dagger communicators
-             mh_send_fwd[b][j][2*i+1] = comm_declare_strided_send_relative(base[3], i, +1, blksize, Nblocks, Stride);
-             mh_send_back[b][j][2*i+1] = comm_declare_strided_send_relative(base[1], i, -1, blksize, Nblocks, Stride);
+             mh_send_fwd[b][j][2*i+1] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[3], i, +1, blksize, Nblocks, Stride) : NULL;
+             mh_send_back[b][j][2*i+1] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[1], i, -1, blksize, Nblocks, Stride) : NULL;
            } else {
              mh_send_fwd[b][j][2*i+1] = mh_send_fwd[b][j][2*i+0];
              mh_send_back[b][j][2*i+1] = mh_send_back[b][j][2*i+0];
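Every `(j+1 == nFace) ? ... : NULL` edit in this file is the same change: handles are indexed by face depth `j = 0 .. maxNface-1`, but only the depth the operator actually posts (`j+1 == nFace`) now gets a real handle, and the unused slots are left NULL. Schematically (`create_handle` and `declare_handles` are hypothetical stand-ins for the `comm_declare_*` calls, not QUDA API):

```cpp
#include <cstddef>

struct MsgHandle;
MsgHandle *create_handle(int depth); // hypothetical stand-in factory

void declare_handles(MsgHandle *handle[], int maxNface, int nFace)
{
  for (int j = 0; j < maxNface; j++) {
    // Only the face depth actually communicated gets a persistent handle;
    // unused slots stay NULL so accidental use fails fast.
    handle[j] = (j + 1 == nFace) ? create_handle(j + 1) : NULL;
  }
}
```

This avoids allocating persistent communicators that are never posted; as the in-diff comment notes, a cleaner long-term structure is still needed.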
@@ -1030,8 +1031,8 @@ namespace quda {
          void *norm_fwd = static_cast<float*>(norm) + Nt_minus1_offset;
          void *norm_back = norm; // the first time slice has zero offset
          for(int b=0; b<2; ++b){
-           mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(norm_fwd, i, +1, surfaceCB[i]*(j+1)*sizeof(float));
-           mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(norm_back, i, -1, surfaceCB[i]*(j+1)*sizeof(float));
+           mh_send_norm_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(norm_fwd, i, +1, surfaceCB[i]*(j+1)*sizeof(float)) : NULL;
+           mh_send_norm_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(norm_back, i, -1, surfaceCB[i]*(j+1)*sizeof(float)) : NULL;
            mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i];
            mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i];
          }

@@ -1041,15 +1042,15 @@ namespace quda {
        if(precision == QUDA_HALF_PRECISION){
          for(int b=0; b<2; ++b){
-           mh_recv_norm_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, surfaceCB[i]*sizeof(float)*(j+1));
-           mh_recv_norm_back[b][j][i] = comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, surfaceCB[i]*sizeof(float)*(j+1));
+           mh_recv_norm_fwd[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm) : NULL;
+           mh_recv_norm_back[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, nbytes_Nface_norm) : NULL;
          }
        }
 #endif // GPU_COMMS

        for(int b=0; b<2; ++b){
-         mh_recv_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_face[b][i], i, +1, nbytes_Nface);
-         mh_recv_back[b][j][i] = comm_declare_receive_relative(from_back_face[b][i], i, -1, nbytes_Nface);
+         mh_recv_fwd[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_fwd_face[b][i], i, +1, nbytes_Nface) : NULL;
+         mh_recv_back[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_back_face[b][i], i, -1, nbytes_Nface) : NULL;
        }

@@ -1070,22 +1071,22 @@ namespace quda {
        for (int j=0; j