From 87f8cca3e89617d26579d7d1f2ee72abdab95eb3 Mon Sep 17 00:00:00 2001 From: M Clark Date: Tue, 12 May 2015 14:43:46 -0700 Subject: [PATCH 1/7] Fixed assertion failure for GDR receive buffer with Nface > 1 for Wilson fermions. --- lib/color_spinor_field.cpp | 40 +++++++++++++------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index 97a09cf54b..382d393ec3 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -52,32 +52,20 @@ namespace quda { if (getVerbosity() == QUDA_DEBUG_VERBOSE) printfQuda("Precision = %d, Subset = %d\n", precision, siteSubset); - int num_faces = 1; - int num_norm_faces=2; - - // FIXME - this is a hack from hell that needs to be fixed. When - // the TIFR interface is enabled we are forcing naive staggered - // support which breaks asqtad/hisq fermions. The problem occurs - // because the ghost zone is allocated before we know which - // operator (and hence number of faces are needed). One solution - // may be to separate the ghost zone memory allocation from the - // field itself, which has other benefits (1. on multi-gpu - // machines with UVA, we can read the ghost zone directly from the - // neighbouring field and 2.) we can use a single contiguous - // buffer for the ghost zone and its norm which will reduce - // latency for half precision and allow us to enable GPU_COMMS - // support for half precision). -#ifdef BUILD_TIFR_INTERFACE - if (nSpin == 1) { //staggered - num_faces=2; - num_norm_faces=2; - } -#else - if (nSpin == 1) { // improved staggered - num_faces=6; - num_norm_faces=6; - } -#endif + // FIXME - The ghost zone is allocated before we know which + // operator (and hence number of faces are needed), thus we + // allocate a ghost zone large enough to cope with the maximum + // number of faces (maxNface). This can artificially raise the + // GPU memory requirements. 
One potential future solution may be + // to separate the ghost zone memory allocation from the field + // itself, which has other benefits (1. on multi-gpu machines with + // UVA, we can read the ghost zone directly from the neighbouring + // field and 2.) we can use a single contiguous buffer for the + // ghost zone and its norm which will reduce latency for half + // precision and allow us to enable GPU_COMMS support for half + // precision). + int num_faces = ((nSpin == 1) ? 2 : 1) * maxNface; + int num_norm_faces = 2*maxNface; // calculate size of ghost zone required int ghostVolume = 0; From 705adab3bda994327bbe495a2f2b3ab72bbf0cf2 Mon Sep 17 00:00:00 2001 From: M Clark Date: Tue, 12 May 2015 15:00:57 -0700 Subject: [PATCH 2/7] Small clean up of message handle creation in cuda_color_field.cpp and added debugging of communicator declaration in comm_common.cpp. --- lib/comm_common.cpp | 20 ++++++++++++++++++++ lib/cuda_color_spinor_field.cu | 9 +++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index e1ff605351..ab4de2ef68 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -228,6 +228,16 @@ int comm_coord(int dim) */ MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes) { +#ifdef HOST_DEBUG + cudaPointerAttributes attributes; + cudaPointerGetAttributes(&attributes, buffer); + if (attributes->memoryType == cudaMemoryTypeHost) { + memset(buffer, 0, nbytes); + } else { + assert(cudaSuccess == cudaMemset(buffer, 0, nbytes)); + } +#endif + int disp[QUDA_MAX_DIM] = {0}; disp[dim] = dir; @@ -240,6 +250,16 @@ MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nby */ MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes) { +#ifdef HOST_DEBUG + cudaPointerAttributes attributes; + cudaPointerGetAttributes(&attributes, buffer); + if (attributes->memoryType == cudaMemoryTypeHost) { + memset(buffer, 0, nbytes); + 
} else { + assert(cudaSuccess == cudaMemset(buffer, 0, nbytes)); + } +#endif + int disp[QUDA_MAX_DIM] = {0}; disp[dim] = dir; diff --git a/lib/cuda_color_spinor_field.cu b/lib/cuda_color_spinor_field.cu index 5611178541..adae047cf7 100644 --- a/lib/cuda_color_spinor_field.cu +++ b/lib/cuda_color_spinor_field.cu @@ -945,6 +945,7 @@ namespace quda { if (!commDimPartitioned(i)) continue; #ifdef GPU_COMMS size_t nbytes_Nface = surfaceCB[i]*Ndof*precision*(j+1); + size_t nbytes_Nface_norm = surfaceCB[i]*(j+1)*sizeof(float); if (i != 3 || getKernelPackT() || getTwistPack()) { #else size_t nbytes_Nface = (nbytes[i] / maxNface) * (j+1); @@ -959,8 +960,8 @@ namespace quda { if(precision == QUDA_HALF_PRECISION){ for(int b=0; b<2; ++b){ - mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, surfaceCB[i]*(j+1)*sizeof(float)); - mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(my_back_norm_face[b][i], i, -1, surfaceCB[i]*(j+1)*sizeof(float)); + mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm); + mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(my_back_norm_face[b][i], i, -1, nbytes_Nface_norm); mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i]; mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i]; } @@ -1041,8 +1042,8 @@ namespace quda { if(precision == QUDA_HALF_PRECISION){ for(int b=0; b<2; ++b){ - mh_recv_norm_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, surfaceCB[i]*sizeof(float)*(j+1)); - mh_recv_norm_back[b][j][i] = comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, surfaceCB[i]*sizeof(float)*(j+1)); + mh_recv_norm_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm); + mh_recv_norm_back[b][j][i] = comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, nbytes_Nface_norm); } } #endif // GPU_COMMS From 
b03757e30fc35333da5a3fa69e996a5de5493b5b Mon Sep 17 00:00:00 2001 From: M Clark Date: Tue, 12 May 2015 21:43:59 -0700 Subject: [PATCH 3/7] Error checking for comms send buffers must be non-destructive. Added CPU comms buffer checking using std::fill and std::copy. --- include/comm_quda.h | 20 +++++++++++++---- lib/comm_common.cpp | 52 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 57 insertions(+), 15 deletions(-) diff --git a/include/comm_quda.h b/include/comm_quda.h index 5deffc865a..09cce216ef 100644 --- a/include/comm_quda.h +++ b/include/comm_quda.h @@ -29,22 +29,34 @@ extern "C" { int comm_coord(int dim); /** - Create a persistent message handler for a relative send + Create a persistent message handler for a relative send. This + should not be called directly, and instead the helper macro + (without the trailing underscore) should be called instead. @param buffer Buffer from which message will be sent @param dim Dimension in which message will be sent @param dir Direction in which messaged with be sent (0 - backwards, 1 forwards) @param nbytes Size of message in bytes */ - MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes); + MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t nbytes); + +#define comm_declare_send_relative(buffer, dim, dir, nbytes) \ + comm_declare_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes) /** - Create a persistent message handler for a relative receive + Create a persistent message handler for a relative receive. This + should not be called directly, and instead the helper macro + (without the trailing underscore) should be called instead. 
@param buffer Buffer into which message will be received @param dim Dimension from message will be received @param dir Direction from messaged with be recived (0 - backwards, 1 forwards) @param nbytes Size of message in bytes */ - MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes); + MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t nbytes); + +#define comm_declare_receive_relative(buffer, dim, dir, nbytes) \ + comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes) /** Create a persistent strided message handler for a relative send diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index ab4de2ef68..b2a9454b6f 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -1,4 +1,5 @@ #include // for gethostname() +#include #include #include @@ -226,15 +227,32 @@ int comm_coord(int dim) /** * Send to the "dir" direction in the "dim" dimension */ -MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes) +MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t nbytes) { #ifdef HOST_DEBUG cudaPointerAttributes attributes; - cudaPointerGetAttributes(&attributes, buffer); - if (attributes->memoryType == cudaMemoryTypeHost) { - memset(buffer, 0, nbytes); + cudaError_t err = cudaPointerGetAttributes(&attributes, buffer); + if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) { + // test this memory allocation is ok by doing a memcpy from it + void *tmp = safe_malloc(nbytes); + try { + std::copy(static_cast(buffer), static_cast(buffer)+nbytes, static_cast(tmp)); + } catch(std::exception &e) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); + errorQuda("Aborting"); + } + if (err != cudaSuccess) cudaGetLastError(); + host_free(tmp); } else { 
- assert(cudaSuccess == cudaMemset(buffer, 0, nbytes)); + // test this memory allocation is ok by doing a memcpy from it + void *tmp = device_malloc(nbytes); + cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice); + if (err != cudaSuccess) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); + errorQuda("Aborting with error %s", cudaGetErrorString(err)); + } + device_free(tmp); } #endif @@ -244,19 +262,31 @@ MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nby return comm_declare_send_displaced(buffer, disp, nbytes); } - /** * Receive from the "dir" direction in the "dim" dimension */ -MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes) +MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t nbytes) { #ifdef HOST_DEBUG cudaPointerAttributes attributes; - cudaPointerGetAttributes(&attributes, buffer); - if (attributes->memoryType == cudaMemoryTypeHost) { - memset(buffer, 0, nbytes); + cudaError_t err = cudaPointerGetAttributes(&attributes, buffer); + if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) { + // test this memory allocation is ok by filling it + try { + std::fill(static_cast(buffer), static_cast(buffer)+nbytes, 0); + } catch(std::exception &e) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); + errorQuda("Aborting"); + } + if (err != cudaSuccess) cudaGetLastError(); } else { - assert(cudaSuccess == cudaMemset(buffer, 0, nbytes)); + // test this memory allocation is ok by doing a memset + cudaError_t err = cudaMemset(buffer, 0, nbytes); + if (err != cudaSuccess) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); + errorQuda("Aborting with error %s", 
cudaGetErrorString(err)); + } } #endif From 3b56ea3f982afbdf1a6cff668573fb0bc26d209c Mon Sep 17 00:00:00 2001 From: M Clark Date: Wed, 13 May 2015 10:36:21 -0700 Subject: [PATCH 4/7] In cudaColorSpinorField::createComms, only allocate the send message handlers for the requested number of faces. --- lib/cuda_color_spinor_field.cu | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/lib/cuda_color_spinor_field.cu b/lib/cuda_color_spinor_field.cu index adae047cf7..89ffe8d2b4 100644 --- a/lib/cuda_color_spinor_field.cu +++ b/lib/cuda_color_spinor_field.cu @@ -552,7 +552,6 @@ namespace quda { // only allocate if not already allocated or buffer required is bigger than previously if(initGhostFaceBuffer == 0 || faceBytes > ghostFaceBytes){ - if (initGhostFaceBuffer){ for(int b=0; b<2; ++b) device_free(ghostFaceBuffer[b]); } @@ -951,8 +950,8 @@ namespace quda { size_t nbytes_Nface = (nbytes[i] / maxNface) * (j+1); #endif for(int b=0; b<2; ++b){ - mh_send_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_face[b][i], i, +1, nbytes_Nface); - mh_send_back[b][j][2*i+0] = comm_declare_send_relative(my_back_face[b][i], i, -1, nbytes_Nface); + mh_send_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_fwd_face[b][i], i, +1, nbytes_Nface) : NULL; + mh_send_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_back_face[b][i], i, -1, nbytes_Nface) : NULL; mh_send_fwd[b][j][2*i+1] = mh_send_fwd[b][j][2*i]; // alias pointers mh_send_back[b][j][2*i+1] = mh_send_back[b][j][2*i]; // alias pointers } @@ -960,8 +959,8 @@ namespace quda { if(precision == QUDA_HALF_PRECISION){ for(int b=0; b<2; ++b){ - mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm); - mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(my_back_norm_face[b][i], i, -1, nbytes_Nface_norm); + mh_send_norm_fwd[b][j][2*i+0] = (j+1 == nFace) ? 
comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm) : NULL; + mh_send_norm_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_back_norm_face[b][i], i, -1, nbytes_Nface_norm) : NULL; mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i]; mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i]; } @@ -1013,11 +1012,12 @@ namespace quda { //printf("%d strided sends with Nface=%d Nblocks=%d blksize=%d Stride=%d\n", i, j+1, Nblocks, blksize, Stride); for(int b=0; b<2; ++b){ - mh_send_fwd[b][j][2*i+0] = comm_declare_strided_send_relative(base[2], i, +1, blksize, Nblocks, Stride); - mh_send_back[b][j][2*i+0] = comm_declare_strided_send_relative(base[0], i, -1, blksize, Nblocks, Stride); + // only allocate a communicator for the present face (this needs cleaned up) + mh_send_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[2], i, +1, blksize, Nblocks, Stride) : NULL; + mh_send_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[0], i, -1, blksize, Nblocks, Stride) : NULL; if (nSpin ==4) { // dagger communicators - mh_send_fwd[b][j][2*i+1] = comm_declare_strided_send_relative(base[3], i, +1, blksize, Nblocks, Stride); - mh_send_back[b][j][2*i+1] = comm_declare_strided_send_relative(base[1], i, -1, blksize, Nblocks, Stride); + mh_send_fwd[b][j][2*i+1] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[3], i, +1, blksize, Nblocks, Stride) : NULL; + mh_send_back[b][j][2*i+1] = (j+1 == nFace) ? 
comm_declare_strided_send_relative(base[1], i, -1, blksize, Nblocks, Stride) : NULL; } else { mh_send_fwd[b][j][2*i+1] = mh_send_fwd[b][j][2*i+0]; mh_send_back[b][j][2*i+1] = mh_send_back[b][j][2*i+0]; @@ -1031,8 +1031,8 @@ namespace quda { void *norm_fwd = static_cast(norm) + Nt_minus1_offset; void *norm_back = norm; // the first time slice has zero offset for(int b=0; b<2; ++b){ - mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(norm_fwd, i, +1, surfaceCB[i]*(j+1)*sizeof(float)); - mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(norm_back, i, -1, surfaceCB[i]*(j+1)*sizeof(float)); + mh_send_norm_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(norm_fwd, i, +1, surfaceCB[i]*(j+1)*sizeof(float)) : NULL; + mh_send_norm_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(norm_back, i, -1, surfaceCB[i]*(j+1)*sizeof(float)) : NULL; mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i]; mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i]; } @@ -1073,20 +1073,20 @@ namespace quda { if (commDimPartitioned(i)) { comm_free(mh_recv_fwd[b][j][i]); comm_free(mh_recv_back[b][j][i]); - comm_free(mh_send_fwd[b][j][2*i]); - comm_free(mh_send_back[b][j][2*i]); + if (mh_send_fwd[b][j][2*i]) comm_free(mh_send_fwd[b][j][2*i]); + if (mh_send_back[b][j][2*i]) comm_free(mh_send_back[b][j][2*i]); // only in a special case are these not aliasing pointers #ifdef GPU_COMMS if(precision == QUDA_HALF_PRECISION){ comm_free(mh_recv_norm_fwd[b][j][i]); comm_free(mh_recv_norm_back[b][j][i]); - comm_free(mh_send_norm_fwd[b][j][2*i]); - comm_free(mh_send_norm_back[b][j][2*i]); + if (mh_send_norm_fwd[b][j][2*i]) comm_free(mh_send_norm_fwd[b][j][2*i]); + if (mh_send_norm_back[b][j][2*i]) comm_free(mh_send_norm_back[b][j][2*i]); } if (i == 3 && !getKernelPackT() && nSpin == 4) { - comm_free(mh_send_fwd[b][j][2*i+1]); - comm_free(mh_send_back[b][j][2*i+1]); + if (mh_send_fwd[b][j][2*i+1]) comm_free(mh_send_fwd[b][j][2*i+1]); + if 
(mh_send_back[b][j][2*i+1]) comm_free(mh_send_back[b][j][2*i+1]); } #endif // GPU_COMMS } From 36292b4e8b5ea665d25556104a85aea9ce4e733a Mon Sep 17 00:00:00 2001 From: M Clark Date: Wed, 13 May 2015 12:05:38 -0700 Subject: [PATCH 5/7] Added buffer validity checking for strided message handlers in lib/comm_common.cpp. --- include/comm_quda.h | 22 ++++++++++---- lib/comm_common.cpp | 70 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 79 insertions(+), 13 deletions(-) diff --git a/include/comm_quda.h b/include/comm_quda.h index 09cce216ef..06bbac2578 100644 --- a/include/comm_quda.h +++ b/include/comm_quda.h @@ -59,7 +59,9 @@ extern "C" { comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes) /** - Create a persistent strided message handler for a relative send + Create a persistent strided message handler for a relative send. + This should not be called directly, and instead the helper macro + (without the trailing underscore) should be called instead. 
@param buffer Buffer from which message will be sent @param dim Dimension in which message will be sent @param dir Direction in which messaged with be sent (0 - backwards, 1 forwards) @@ -67,11 +69,17 @@ extern "C" { @param nblocks Number of blocks @param stride Stride between blocks in bytes */ - MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir, - size_t blksize, int nblocks, size_t stride); + MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, + size_t blksize, int nblocks, size_t stride); + +#define comm_declare_strided_send_relative(buffer, dim, dir, blksize, nblocks, stride) \ + comm_declare_strided_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride) /** Create a persistent strided message handler for a relative receive + This should not be called directly, and instead the helper macro + (without the trailing underscore) should be called instead. 
@param buffer Buffer into which message will be received @param dim Dimension from message will be received @param dir Direction from messaged with be recived (0 - backwards, 1 forwards) @@ -79,8 +87,12 @@ extern "C" { @param nblocks Number of blocks @param stride Stride between blocks in bytes */ - MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir, - size_t blksize, int nblocks, size_t stride); + MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, + size_t blksize, int nblocks, size_t stride); + +#define comm_declare_strided_receive_relative(buffer, dim, dir, blksize, nblocks, stride) \ + comm_declare_strided_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride) void comm_finalize(void); void comm_dim_partitioned_set(int dim); diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index b2a9454b6f..9285a07209 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -240,7 +240,7 @@ MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int l std::copy(static_cast(buffer), static_cast(buffer)+nbytes, static_cast(tmp)); } catch(std::exception &e) { printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); - errorQuda("Aborting"); + errorQuda("aborting"); } if (err != cudaSuccess) cudaGetLastError(); host_free(tmp); @@ -250,7 +250,7 @@ MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int l cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice); if (err != cudaSuccess) { printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); - errorQuda("Aborting with error %s", cudaGetErrorString(err)); + errorQuda("aborting with error %s", cudaGetErrorString(err)); } device_free(tmp); } @@ -277,7 +277,7 @@ MsgHandle 
*comm_declare_receive_relative_(const char *func, const char *file, in std::fill(static_cast(buffer), static_cast(buffer)+nbytes, 0); } catch(std::exception &e) { printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); - errorQuda("Aborting"); + errorQuda("aborting"); } if (err != cudaSuccess) cudaGetLastError(); } else { @@ -285,7 +285,7 @@ MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, in cudaError_t err = cudaMemset(buffer, 0, nbytes); if (err != cudaSuccess) { printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes); - errorQuda("Aborting with error %s", cudaGetErrorString(err)); + errorQuda("aborting with error %s", cudaGetErrorString(err)); } } #endif @@ -299,9 +299,38 @@ MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, in /** * Strided send to the "dir" direction in the "dim" dimension */ -MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir, - size_t blksize, int nblocks, size_t stride) +MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride) { +#ifdef HOST_DEBUG + cudaPointerAttributes attributes; + cudaError_t err = cudaPointerGetAttributes(&attributes, buffer); + if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) { + // test this memory allocation is ok by doing a memcpy from it + void *tmp = safe_malloc(blksize*nblocks); + try { + for (int i=0; i(buffer)+i*stride, static_cast(buffer)+i*stride+blksize, static_cast(tmp)); + } catch(std::exception &e) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n", + file, line, func, dim, dir, blksize, nblocks, stride); + errorQuda("aborting"); + } + host_free(tmp); + if (err != cudaSuccess) cudaGetLastError(); + } else { + // 
test this memory allocation is ok by doing a memcpy from it + void *tmp = device_malloc(blksize*nblocks); + cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice); + if (err != cudaSuccess) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n", + file, line, func, dim, dir, blksize, nblocks, stride); + errorQuda("aborting with error %s", cudaGetErrorString(err)); + } + device_free(tmp); + } +#endif + int disp[QUDA_MAX_DIM] = {0}; disp[dim] = dir; @@ -312,9 +341,34 @@ MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir, /** * Strided receive from the "dir" direction in the "dim" dimension */ -MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir, - size_t blksize, int nblocks, size_t stride) +MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line, + void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride) { +#ifdef HOST_DEBUG + cudaPointerAttributes attributes; + cudaError_t err = cudaPointerGetAttributes(&attributes, buffer); + if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) { + // test this memory allocation is ok by filling it + try { + for (int i=0; i(buffer)+i*stride, static_cast(buffer)+i*stride+blksize, 0); + } catch(std::exception &e) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n", + file, line, func, dim, dir, blksize, nblocks, stride); + errorQuda("aborting"); + } + if (err != cudaSuccess) cudaGetLastError(); + } else { + // test this memory allocation is ok by doing a memset + cudaError_t err = cudaMemset2D(buffer, stride, 0, blksize, nblocks); + if (err != cudaSuccess) { + printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n", + file, line, func, dim, dir, blksize, nblocks, stride); + errorQuda("aborting with error 
%s", cudaGetErrorString(err)); + } } +#endif + int disp[QUDA_MAX_DIM] = {0}; disp[dim] = dir; From 634d17075bb4f13b397bad0209cf686df791ccdd Mon Sep 17 00:00:00 2001 From: M Clark Date: Wed, 13 May 2015 12:33:45 -0700 Subject: [PATCH 6/7] Only allocate receive message handlers for the requested number of faces. --- lib/cuda_color_spinor_field.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/cuda_color_spinor_field.cu b/lib/cuda_color_spinor_field.cu index 89ffe8d2b4..3e1bdb1581 100644 --- a/lib/cuda_color_spinor_field.cu +++ b/lib/cuda_color_spinor_field.cu @@ -1042,15 +1042,15 @@ namespace quda { if(precision == QUDA_HALF_PRECISION){ for(int b=0; b<2; ++b){ - mh_recv_norm_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm); - mh_recv_norm_back[b][j][i] = comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, nbytes_Nface_norm); + mh_recv_norm_fwd[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm) : NULL; + mh_recv_norm_back[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, nbytes_Nface_norm) : NULL; } } #endif // GPU_COMMS for(int b=0; b<2; ++b){ - mh_recv_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_face[b][i], i, +1, nbytes_Nface); - mh_recv_back[b][j][i] = comm_declare_receive_relative(from_back_face[b][i], i, -1, nbytes_Nface); + mh_recv_fwd[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_fwd_face[b][i], i, +1, nbytes_Nface) : NULL; + mh_recv_back[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_back_face[b][i], i, -1, nbytes_Nface) : NULL; } @@ -1071,15 +1071,15 @@ namespace quda { for (int j=0; j Date: Wed, 13 May 2015 12:35:19 -0700 Subject: [PATCH 7/7] Always allocate a ghost zone of one for Wilson-like and three for staggered fermions. 
--- lib/color_spinor_field.cpp | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index 382d393ec3..6cee431084 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -55,17 +55,24 @@ namespace quda { // FIXME - The ghost zone is allocated before we know which // operator (and hence number of faces are needed), thus we // allocate a ghost zone large enough to cope with the maximum - number of faces (maxNface). This can artificially raise the - GPU memory requirements. One potential future solution may be - to separate the ghost zone memory allocation from the field - itself, which has other benefits (1. on multi-gpu machines with - UVA, we can read the ghost zone directly from the neighbouring - field and 2.) we can use a single contiguous buffer for the - ghost zone and its norm which will reduce latency for half - precision and allow us to enable GPU_COMMS support for half - precision). - int num_faces = ((nSpin == 1) ? 2 : 1) * maxNface; - int num_norm_faces = 2*maxNface; + number of faces. All Wilson-like operators only + involve the exchange of one face so this is no problem. + However, for staggered fermions, we have either nFace=1 or 3, + thus we allocate using the latter. This will artificially + raise the GPU memory requirements for naive staggered fermions. + One potential future solution may be to separate the ghost zone + memory allocation from the field itself, which has other + benefits (1. on multi-gpu machines with UVA, we can read the + ghost zone directly from the neighbouring field and 2.) we can + use a single contiguous buffer for the ghost zone and its norm + which will reduce latency for half precision and allow us to + enable GPU_COMMS support for half precision). + int nFaceGhost = (nSpin == 1) ? 
3 : 1; + + // For Wilson we halve the number of effective faces since the + // fields are spin projected. + int num_faces = ((nSpin == 1) ? 2 : 1) * nFaceGhost; + int num_norm_faces = 2*nFaceGhost; // calculate size of ghost zone required int ghostVolume = 0;