Skip to content

Commit

Permalink
Merge pull request #238 from lattice/hotfix/gdr
Browse files Browse the repository at this point in the history
Improve robustness of GPUDirect and fix silent errors
  • Loading branch information
Mathias Wagner committed May 14, 2015
2 parents 75b0281 + 00b01f1 commit 0859c72
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 67 deletions.
42 changes: 33 additions & 9 deletions include/comm_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,46 +29,70 @@ extern "C" {
int comm_coord(int dim);

/**
Create a persistent message handler for a relative send
Create a persistent message handler for a relative send. This
should not be called directly; instead call the helper macro
(without the trailing underscore).
@param buffer Buffer from which message will be sent
@param dim Dimension in which message will be sent
@param dir Direction in which message will be sent (0 - backwards, 1 - forwards)
@param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes);
MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes);

#define comm_declare_send_relative(buffer, dim, dir, nbytes) \
comm_declare_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

/**
Create a persistent message handler for a relative receive
Create a persistent message handler for a relative receive. This
should not be called directly; instead call the helper macro
(without the trailing underscore).
@param buffer Buffer into which message will be received
@param dim Dimension from which message will be received
@param dir Direction from which message will be received (0 - backwards, 1 - forwards)
@param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes);
MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes);

#define comm_declare_receive_relative(buffer, dim, dir, nbytes) \
comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

/**
Create a persistent strided message handler for a relative send
Create a persistent strided message handler for a relative send.
This should not be called directly; instead call the helper
macro (without the trailing underscore).
@param buffer Buffer from which message will be sent
@param dim Dimension in which message will be sent
@param dir Direction in which message will be sent (0 - backwards, 1 - forwards)
@param blksize Size of block in bytes
@param nblocks Number of blocks
@param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);
MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);

#define comm_declare_strided_send_relative(buffer, dim, dir, blksize, nblocks, stride) \
comm_declare_strided_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

/**
Create a persistent strided message handler for a relative receive
This should not be called directly; instead call the helper
macro (without the trailing underscore).
@param buffer Buffer into which message will be received
@param dim Dimension from which message will be received
@param dir Direction from which message will be received (0 - backwards, 1 - forwards)
@param blksize Size of block in bytes
@param nblocks Number of blocks
@param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);
MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);

#define comm_declare_strided_receive_relative(buffer, dim, dir, blksize, nblocks, stride) \
comm_declare_strided_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

void comm_finalize(void);
void comm_dim_partitioned_set(int dim);
Expand Down
47 changes: 21 additions & 26 deletions lib/color_spinor_field.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,32 +52,27 @@ namespace quda {
if (getVerbosity() == QUDA_DEBUG_VERBOSE)
printfQuda("Precision = %d, Subset = %d\n", precision, siteSubset);

int num_faces = 1;
int num_norm_faces=2;

// FIXME - this is a hack from hell that needs to be fixed. When
// the TIFR interface is enabled we are forcing naive staggered
// support which breaks asqtad/hisq fermions. The problem occurs
// because the ghost zone is allocated before we know which
// operator (and hence number of faces are needed). One solution
// may be to separate the ghost zone memory allocation from the
// field itself, which has other benefits (1. on multi-gpu
// machines with UVA, we can read the ghost zone directly from the
// neighbouring field and 2.) we can use a single contiguous
// buffer for the ghost zone and its norm which will reduce
// latency for half precision and allow us to enable GPU_COMMS
// support for half precision).
#ifdef BUILD_TIFR_INTERFACE
if (nSpin == 1) { //staggered
num_faces=2;
num_norm_faces=2;
}
#else
if (nSpin == 1) { // improved staggered
num_faces=6;
num_norm_faces=6;
}
#endif
// FIXME - The ghost zone is allocated before we know which
// operator (and hence number of faces are needed), thus we
// allocate a ghost zone large enough to cope with the maximum
number of faces. All Wilson-like operators only involve the
exchange of a single face, so this is no problem.
// However, for staggered fermions, we have either nFace=1 or 3,
thus we allocate using the latter. This will artificially
// raise the GPU memory requirements for naive staggered fermions.
// One potential future solution may be to separate the ghost zone
// memory allocation from the field itself, which has other
// benefits (1. on multi-gpu machines with UVA, we can read the
// ghost zone directly from the neighbouring field and 2.) we can
// use a single contiguous buffer for the ghost zone and its norm
// which will reduce latency for half precision and allow us to
// enable GPU_COMMS support for half precision).
int nFaceGhost = (nSpin == 1) ? 3 : 1;

// For Wilson we have the number of effective faces since the
// fields are spin projected.
int num_faces = ((nSpin == 1) ? 2 : 1) * nFaceGhost;
int num_norm_faces = 2*nFaceGhost;

// calculate size of ghost zone required
int ghostVolume = 0;
Expand Down
118 changes: 111 additions & 7 deletions lib/comm_common.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <unistd.h> // for gethostname()
#include <assert.h>

#include <quda_internal.h>
#include <comm_quda.h>
Expand Down Expand Up @@ -226,20 +227,69 @@ int comm_coord(int dim)
/**
* Send to the "dir" direction in the "dim" dimension
*/
MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes)
MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes)
{
// func/file/line describe the call site; they are supplied by the
// comm_declare_send_relative macro and are only used for error reporting.
#ifdef HOST_DEBUG
// Sanity-check that the send buffer is actually readable before the
// message handle is declared, so a bad pointer fails loudly here rather
// than producing a silent error inside the communications layer.
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// Host memory, or a pointer unknown to the CUDA runtime:
// test this memory allocation is ok by doing a memcpy from it
void *tmp = safe_malloc(nbytes);
try {
std::copy(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, static_cast<char*>(tmp));
} catch(std::exception &e) {
// NOTE(review): an invalid host pointer typically faults rather than
// throwing, so this catch is best-effort only -- confirm intent.
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting");
}
// clear the sticky error left by cudaPointerGetAttributes when the
// pointer was not registered with the CUDA runtime
if (err != cudaSuccess) cudaGetLastError();
host_free(tmp);
} else {
// Device memory: test this memory allocation is ok by doing a
// device-to-device memcpy from it
void *tmp = device_malloc(nbytes);
cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
device_free(tmp);
}
#endif

// displacement of +/-1 in the requested dimension; all other dims are 0
int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

return comm_declare_send_displaced(buffer, disp, nbytes);
}


/**
* Receive from the "dir" direction in the "dim" dimension
*/
MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes)
MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes)
{
#ifdef HOST_DEBUG
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// test this memory allocation is ok by filling it
try {
std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, 0);
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting");
}
if (err != cudaSuccess) cudaGetLastError();
} else {
// test this memory allocation is ok by doing a memset
cudaError_t err = cudaMemset(buffer, 0, nbytes);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
}
#endif

int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

Expand All @@ -249,9 +299,38 @@ MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t
/**
* Strided send to the "dir" direction in the "dim" dimension
*/
MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride)
MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
{
#ifdef HOST_DEBUG
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// test this memory allocation is ok by doing a memcpy from it
void *tmp = safe_malloc(blksize*nblocks);
try {
for (int i=0; i<nblocks; i++)
std::copy(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, static_cast<char*>(tmp));
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting");
}
host_free(tmp);
if (err != cudaSuccess) cudaGetLastError();
} else {
// test this memory allocation is ok by doing a memcpy from it
void *tmp = device_malloc(blksize*nblocks);
cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
device_free(tmp);
}
#endif

int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

Expand All @@ -262,9 +341,34 @@ MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
/**
* Strided receive from the "dir" direction in the "dim" dimension
*/
MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride)
MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
{
#ifdef HOST_DEBUG
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// test this memory allocation is ok by filling it
try {
for (int i=0; i<nblocks; i++)
std::fill(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, 0);
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting");
}
if (err != cudaSuccess) cudaGetLastError();
} else {
// test this memory allocation is ok by doing a memset
cudaError_t err = cudaMemset2D(buffer, stride, 0, blksize, nblocks);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
}
#endif

int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

Expand Down
Loading

0 comments on commit 0859c72

Please sign in to comment.