Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve robustness of GPUDirect and fix silent errors #238

Merged
merged 7 commits into from
May 14, 2015
42 changes: 33 additions & 9 deletions include/comm_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,46 +29,70 @@ extern "C" {
int comm_coord(int dim);

/**
Create a persistent message handler for a relative send
Create a persistent message handler for a relative send. This
should not be called directly, and instead the helper macro
(without the trailing underscore) should be called instead.
@param buffer Buffer from which message will be sent
@param dim Dimension in which message will be sent
@param dir Direction in which message will be sent (0 - backwards, 1 - forwards)
@param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes);
MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes);

#define comm_declare_send_relative(buffer, dim, dir, nbytes) \
comm_declare_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

/**
Create a persistent message handler for a relative receive
Create a persistent message handler for a relative receive. This
should not be called directly, and instead the helper macro
(without the trailing underscore) should be called instead.
@param buffer Buffer into which message will be received
@param dim Dimension from which message will be received
@param dir Direction from which message will be received (0 - backwards, 1 - forwards)
@param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes);
MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes);

#define comm_declare_receive_relative(buffer, dim, dir, nbytes) \
comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

/**
Create a persistent strided message handler for a relative send
Create a persistent strided message handler for a relative send.
This should not be called directly, and instead the helper macro
(without the trailing underscore) should be called instead.
@param buffer Buffer from which message will be sent
@param dim Dimension in which message will be sent
@param dir Direction in which message will be sent (0 - backwards, 1 - forwards)
@param blksize Size of block in bytes
@param nblocks Number of blocks
@param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);
MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);

#define comm_declare_strided_send_relative(buffer, dim, dir, blksize, nblocks, stride) \
comm_declare_strided_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

/**
Create a persistent strided message handler for a relative receive.
This should not be called directly, and instead the helper macro
(without the trailing underscore) should be called instead.
@param buffer Buffer into which message will be received
@param dim Dimension from which message will be received
@param dir Direction from which message will be received (0 - backwards, 1 - forwards)
@param blksize Size of block in bytes
@param nblocks Number of blocks
@param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);
MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);

#define comm_declare_strided_receive_relative(buffer, dim, dir, blksize, nblocks, stride) \
comm_declare_strided_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

void comm_finalize(void);
void comm_dim_partitioned_set(int dim);
Expand Down
47 changes: 21 additions & 26 deletions lib/color_spinor_field.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,32 +52,27 @@ namespace quda {
if (getVerbosity() == QUDA_DEBUG_VERBOSE)
printfQuda("Precision = %d, Subset = %d\n", precision, siteSubset);

int num_faces = 1;
int num_norm_faces=2;

// FIXME - this is a hack from hell that needs to be fixed. When
// the TIFR interface is enabled we are forcing naive staggered
// support which breaks asqtad/hisq fermions. The problem occurs
// because the ghost zone is allocated before we know which
// operator (and hence number of faces are needed). One solution
// may be to separate the ghost zone memory allocation from the
// field itself, which has other benefits (1. on multi-gpu
// machines with UVA, we can read the ghost zone directly from the
// neighbouring field and 2.) we can use a single contiguous
// buffer for the ghost zone and its norm which will reduce
// latency for half precision and allow us to enable GPU_COMMS
// support for half precision).
#ifdef BUILD_TIFR_INTERFACE
if (nSpin == 1) { //staggered
num_faces=2;
num_norm_faces=2;
}
#else
if (nSpin == 1) { // improved staggered
num_faces=6;
num_norm_faces=6;
}
#endif
// FIXME - The ghost zone is allocated before we know which
// operator (and hence number of faces are needed), thus we
// allocate a ghost zone large enough to cope with the maximum
// number of faces. All Wilson-like operators only involve the
// exchange of one face, so this is no problem.
// However, for staggered fermions, we have either nFace=1 or 3,
// thus we allocate using the latter. This will artificially
// raise the GPU memory requirements for naive staggered fermions.
// One potential future solution may be to separate the ghost zone
// memory allocation from the field itself, which has other
// benefits (1. on multi-gpu machines with UVA, we can read the
// ghost zone directly from the neighbouring field and 2.) we can
// use a single contiguous buffer for the ghost zone and its norm
// which will reduce latency for half precision and allow us to
// enable GPU_COMMS support for half precision).
int nFaceGhost = (nSpin == 1) ? 3 : 1;

// For Wilson we have the number of effective faces since the
// fields are spin projected.
int num_faces = ((nSpin == 1) ? 2 : 1) * nFaceGhost;
int num_norm_faces = 2*nFaceGhost;

// calculate size of ghost zone required
int ghostVolume = 0;
Expand Down
118 changes: 111 additions & 7 deletions lib/comm_common.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <unistd.h> // for gethostname()
#include <assert.h>

#include <quda_internal.h>
#include <comm_quda.h>
Expand Down Expand Up @@ -226,20 +227,69 @@ int comm_coord(int dim)
/**
 * Create a persistent message handle for a send to the "dir"
 * direction in the "dim" dimension.  Not called directly: the
 * comm_declare_send_relative macro wraps this, supplying the caller's
 * function name, file, and line for use in error reporting.
 *
 * @param func   Calling function name (__func__, injected by the macro)
 * @param file   Calling file name (__FILE__, injected by the macro)
 * @param line   Calling line number (__LINE__, injected by the macro)
 * @param buffer Buffer from which the message will be sent
 * @param dim    Dimension in which the message will be sent
 * @param dir    Direction of the send
 * @param nbytes Size of the message in bytes
 * @return Persistent message handle created by comm_declare_send_displaced
 */
MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes)
{
#ifdef HOST_DEBUG
// Debug-build sanity check: verify the buffer is readable for the full
// nbytes before declaring the communication handle, so bad buffers are
// reported at the declaration site instead of failing silently later.
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// Host memory (or a pointer CUDA does not recognize): probe it by
// copying nbytes into a scratch host buffer.
// NOTE(review): an out-of-bounds read is undefined behavior and will
// usually crash rather than throw, so this catch is best-effort only.
// test this memory allocation is ok by doing a memcpy from it
void *tmp = safe_malloc(nbytes);
try {
std::copy(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, static_cast<char*>(tmp));
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting");
}
// Clear the sticky error left by a failed cudaPointerGetAttributes so
// it is not spuriously reported by a later CUDA call.
if (err != cudaSuccess) cudaGetLastError();
host_free(tmp);
} else {
// Device memory: probe it with a device-to-device copy of nbytes;
// cudaMemcpy reports an error if the source range is invalid.
// test this memory allocation is ok by doing a memcpy from it
void *tmp = device_malloc(nbytes);
cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
device_free(tmp);
}
#endif

// Displacement of "dir" in dimension "dim", zero in all other dimensions.
int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

return comm_declare_send_displaced(buffer, disp, nbytes);
}


/**
* Receive from the "dir" direction in the "dim" dimension
*/
MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes)
MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes)
{
#ifdef HOST_DEBUG
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// test this memory allocation is ok by filling it
try {
std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, 0);
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting");
}
if (err != cudaSuccess) cudaGetLastError();
} else {
// test this memory allocation is ok by doing a memset
cudaError_t err = cudaMemset(buffer, 0, nbytes);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
}
#endif

int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

Expand All @@ -249,9 +299,38 @@ MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t
/**
* Strided send to the "dir" direction in the "dim" dimension
*/
MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride)
MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
{
#ifdef HOST_DEBUG
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// test this memory allocation is ok by doing a memcpy from it
void *tmp = safe_malloc(blksize*nblocks);
try {
for (int i=0; i<nblocks; i++)
std::copy(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, static_cast<char*>(tmp));
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting");
}
host_free(tmp);
if (err != cudaSuccess) cudaGetLastError();
} else {
// test this memory allocation is ok by doing a memcpy from it
void *tmp = device_malloc(blksize*nblocks);
cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
device_free(tmp);
}
#endif

int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

Expand All @@ -262,9 +341,34 @@ MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
/**
* Strided receive from the "dir" direction in the "dim" dimension
*/
MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride)
MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
{
#ifdef HOST_DEBUG
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
// test this memory allocation is ok by filling it
try {
for (int i=0; i<nblocks; i++)
std::fill(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, 0);
} catch(std::exception &e) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting");
}
if (err != cudaSuccess) cudaGetLastError();
} else {
// test this memory allocation is ok by doing a memset
cudaError_t err = cudaMemset2D(buffer, stride, 0, blksize, nblocks);
if (err != cudaSuccess) {
printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
file, line, func, dim, dir, blksize, nblocks, stride);
errorQuda("aborting with error %s", cudaGetErrorString(err));
}
}
#endif

int disp[QUDA_MAX_DIM] = {0};
disp[dim] = dir;

Expand Down
Loading