Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve robustness of GPUDirect and fix silent errors #238

Merged
merged 7 commits into from
May 14, 2015
42 changes: 33 additions & 9 deletions include/comm_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,46 +29,70 @@ extern "C" {
int comm_coord(int dim);

/**
Create a persistent message handler for a relative send
Create a persistent message handler for a relative send. This
should not be called directly, and instead the helper macro
(without the trailing underscore) should be called instead.
@param buffer Buffer from which message will be sent
@param dim Dimension in which message will be sent
@param dir Direction in which message will be sent (0 - backwards, 1 - forwards)
@param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes);
MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes);

#define comm_declare_send_relative(buffer, dim, dir, nbytes) \
comm_declare_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

/**
Create a persistent message handler for a relative receive
Create a persistent message handler for a relative receive. This
should not be called directly, and instead the helper macro
(without the trailing underscore) should be called instead.
@param buffer Buffer into which message will be received
@param dim Dimension from which message will be received
@param dir Direction from which message will be received (0 - backwards, 1 - forwards)
@param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes);
MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes);

#define comm_declare_receive_relative(buffer, dim, dir, nbytes) \
comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

/**
Create a persistent strided message handler for a relative send
Create a persistent strided message handler for a relative send.
This should not be called directly, and instead the helper macro
(without the trailing underscore) should be called instead.
@param buffer Buffer from which message will be sent
@param dim Dimension in which message will be sent
@param dir Direction in which message will be sent (0 - backwards, 1 - forwards)
@param blksize Size of block in bytes
@param nblocks Number of blocks
@param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);
MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);

#define comm_declare_strided_send_relative(buffer, dim, dir, blksize, nblocks, stride) \
comm_declare_strided_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

/**
Create a persistent strided message handler for a relative receive.
This should not be called directly, and instead the helper macro
(without the trailing underscore) should be called instead.
@param buffer Buffer into which message will be received
@param dim Dimension from which message will be received
@param dir Direction from which message will be received (0 - backwards, 1 - forwards)
@param blksize Size of block in bytes
@param nblocks Number of blocks
@param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);
MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);

#define comm_declare_strided_receive_relative(buffer, dim, dir, blksize, nblocks, stride) \
comm_declare_strided_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

void comm_finalize(void);
void comm_dim_partitioned_set(int dim);
Expand Down
47 changes: 21 additions & 26 deletions lib/color_spinor_field.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,32 +52,27 @@ namespace quda {
if (getVerbosity() == QUDA_DEBUG_VERBOSE)
printfQuda("Precision = %d, Subset = %d\n", precision, siteSubset);

int num_faces = 1;
int num_norm_faces=2;

// FIXME - this is a hack from hell that needs to be fixed. When
// the TIFR interface is enabled we are forcing naive staggered
// support which breaks asqtad/hisq fermions. The problem occurs
// because the ghost zone is allocated before we know which
// operator (and hence number of faces are needed). One solution
// may be to separate the ghost zone memory allocation from the
// field itself, which has other benefits (1. on multi-gpu
// machines with UVA, we can read the ghost zone directly from the
// neighbouring field and 2.) we can use a single contiguous
// buffer for the ghost zone and its norm which will reduce
// latency for half precision and allow us to enable GPU_COMMS
// support for half precision).
#ifdef BUILD_TIFR_INTERFACE
if (nSpin == 1) { //staggered
num_faces=2;
num_norm_faces=2;
}
#else
if (nSpin == 1) { // improved staggered
num_faces=6;
num_norm_faces=6;
}
#endif
// FIXME - The ghost zone is allocated before we know which
// operator (and hence number of faces are needed), thus we
// allocate a ghost zone large enough to cope with the maximum
// number of faces. All Wilson-like operators only involve the
// exchange of one face, so this is no problem.
// However, for staggered fermions, we have either nFace=1 or 3,
// thus we allocate using the latter. This will artificially
// raise the GPU memory requirements for naive staggered fermions.
// One potential future solution may be to separate the ghost zone
// memory allocation from the field itself, which has other
// benefits (1. on multi-gpu machines with UVA, we can read the
// ghost zone directly from the neighbouring field and 2.) we can
// use a single contiguous buffer for the ghost zone and its norm
// which will reduce latency for half precision and allow us to
// enable GPU_COMMS support for half precision).
int nFaceGhost = (nSpin == 1) ? 3 : 1;

// For Wilson we have the number of effective faces since the
// fields are spin projected.
int num_faces = ((nSpin == 1) ? 2 : 1) * nFaceGhost;
int num_norm_faces = 2*nFaceGhost;

// calculate size of ghost zone required
int ghostVolume = 0;
Expand Down
118 changes: 111 additions & 7 deletions lib/comm_common.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <unistd.h> // for gethostname()
#include <assert.h>

#include <quda_internal.h>
#include <comm_quda.h>
Expand Down Expand Up @@ -226,20 +227,69 @@ int comm_coord(int dim)
/**
 * Create a persistent message handle for a send to the "dir"
 * direction in the "dim" dimension.  Not called directly: the
 * comm_declare_send_relative macro wraps this, supplying the caller's
 * function name, file, and line for use in error reporting.
 *
 * @param func   Calling function name (__func__, injected by the macro)
 * @param file   Calling file name (__FILE__, injected by the macro)
 * @param line   Calling line number (__LINE__, injected by the macro)
 * @param buffer Buffer from which the message will be sent
 * @param dim    Dimension in which the message will be sent
 * @param dir    Direction of the send
 * @param nbytes Size of the message in bytes
 * @return Persistent message handle created by comm_declare_send_displaced
 */
MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes)
{
#ifdef HOST_DEBUG
// Debug-build sanity check: verify the buffer is readable for the full
// nbytes before declaring the communication handle, so bad buffers are
// reported at the declaration site instead of failing silently later.
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// Host memory (or a pointer CUDA does not recognize): probe it by
// copying nbytes into a scratch host buffer.
// NOTE(review): an out-of-bounds read is undefined behavior and will
// usually crash rather than throw, so this catch is best-effort only.
// test this memory allocation is ok by doing a memcpy from it
void *tmp = safe_malloc(nbytes);
try {
std::copy(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, static_cast<char*>(tmp));
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting");
}
// Clear the sticky error left by a failed cudaPointerGetAttributes so
// it is not spuriously reported by a later CUDA call.
if (err != cudaSuccess) cudaGetLastError();
host_free(tmp);
} else {
// Device memory: probe it with a device-to-device copy of nbytes;
// cudaMemcpy reports an error if the source range is invalid.
// test this memory allocation is ok by doing a memcpy from it
void *tmp = device_malloc(nbytes);
cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
device_free(tmp);
}
#endif

// Displacement of "dir" in dimension "dim", zero in all other dimensions.
int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

return comm_declare_send_displaced(buffer, disp, nbytes);
}


/**
* Receive from the "dir" direction in the "dim" dimension
*/
MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes)
MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes)
{
#ifdef HOST_DEBUG
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// test this memory allocation is ok by filling it
try {
std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, 0);
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting");
}
if (err != cudaSuccess) cudaGetLastError();
} else {
// test this memory allocation is ok by doing a memset
cudaError_t err = cudaMemset(buffer, 0, nbytes);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
}
#endif

int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

Expand All @@ -249,9 +299,38 @@ MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t
/**
* Strided send to the "dir" direction in the "dim" dimension
*/
MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride)
MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
{
#ifdef HOST_DEBUG
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// test this memory allocation is ok by doing a memcpy from it
void *tmp = safe_malloc(blksize*nblocks);
try {
for (int i=0; i<nblocks; i++)
std::copy(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, static_cast<char*>(tmp));
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting");
}
host_free(tmp);
if (err != cudaSuccess) cudaGetLastError();
} else {
// test this memory allocation is ok by doing a memcpy from it
void *tmp = device_malloc(blksize*nblocks);
cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
device_free(tmp);
}
#endif

int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

Expand All @@ -262,9 +341,34 @@ MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
/**
* Strided receive from the "dir" direction in the "dim" dimension
*/
MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride)
MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
{
#ifdef HOST_DEBUG
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// test this memory allocation is ok by filling it
try {
for (int i=0; i<nblocks; i++)
std::fill(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, 0);
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting");
}
if (err != cudaSuccess) cudaGetLastError();
} else {
// test this memory allocation is ok by doing a memset
cudaError_t err = cudaMemset2D(buffer, stride, 0, blksize, nblocks);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
}
#endif

int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

Expand Down
Loading