diff --git a/include/comm_quda.h b/include/comm_quda.h
index 5deffc865a..06bbac2578 100644
--- a/include/comm_quda.h
+++ b/include/comm_quda.h
@@ -29,25 +29,39 @@ extern "C" {
   int comm_coord(int dim);

   /**
-     Create a persistent message handler for a relative send
+     Create a persistent message handler for a relative send.  This
+     should not be called directly; use the helper macro (without
+     the trailing underscore) instead.
      @param buffer Buffer from which message will be sent
      @param dim Dimension in which message will be sent
      @param dir Direction in which message will be sent (0 - backwards, 1 - forwards)
      @param nbytes Size of message in bytes
   */
-  MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes);
+  MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
+                                         void *buffer, int dim, int dir, size_t nbytes);
+
+#define comm_declare_send_relative(buffer, dim, dir, nbytes) \
+  comm_declare_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

   /**
-     Create a persistent message handler for a relative receive
+     Create a persistent message handler for a relative receive.  This
+     should not be called directly; use the helper macro (without
+     the trailing underscore) instead.
      @param buffer Buffer into which message will be received
      @param dim Dimension from which message will be received
      @param dir Direction from which message will be received (0 - backwards, 1 - forwards)
      @param nbytes Size of message in bytes
   */
-  MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes);
+  MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
+                                            void *buffer, int dim, int dir, size_t nbytes);
+
+#define comm_declare_receive_relative(buffer, dim, dir, nbytes) \
+  comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

   /**
-     Create a persistent strided message handler for a relative send
+     Create a persistent strided message handler for a relative send.
+     This should not be called directly; use the helper macro
+     (without the trailing underscore) instead.
      @param buffer Buffer from which message will be sent
      @param dim Dimension in which message will be sent
      @param dir Direction in which message will be sent (0 - backwards, 1 - forwards)
@@ -55,11 +69,17 @@ extern "C" {
      @param nblocks Number of blocks
      @param stride Stride between blocks in bytes
   */
-  MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
-                                                size_t blksize, int nblocks, size_t stride);
+  MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
+                                                 void *buffer, int dim, int dir,
+                                                 size_t blksize, int nblocks, size_t stride);
+
+#define comm_declare_strided_send_relative(buffer, dim, dir, blksize, nblocks, stride) \
+  comm_declare_strided_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

   /**
      Create a persistent strided message handler for a relative receive
+     This should not be called directly; use the helper macro (without
+     the trailing underscore) instead.
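The declarations above all follow one idiom: the real entry point gains a trailing underscore plus three leading parameters, and a function-like macro with the old name forwards `__func__`, `__FILE__`, and `__LINE__`, so every existing call site transparently reports where it was invoked from. A minimal self-contained sketch of the same idiom (the `my_alloc` name is illustrative, not part of QUDA):

```c
#include <stdio.h>
#include <stdlib.h>

/* Real entry point: receives the call-site information explicitly. */
static void *my_alloc_(const char *func, const char *file, int line, size_t nbytes)
{
  void *ptr = malloc(nbytes);
  if (!ptr) {
    fprintf(stderr, "malloc of %zu bytes failed (%s:%d in %s())\n",
            nbytes, file, line, func);
    exit(EXIT_FAILURE);
  }
  return ptr;
}

/* Helper macro (no trailing underscore): captures the caller's location.
   Call sites keep the old spelling, e.g. buf = my_alloc(1024); */
#define my_alloc(nbytes) my_alloc_(__func__, __FILE__, __LINE__, nbytes)
```

Because the macro keeps the old name and argument list, no call site elsewhere in the code base needs to change.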
      @param buffer Buffer into which message will be received
      @param dim Dimension from which message will be received
      @param dir Direction from which message will be received (0 - backwards, 1 - forwards)
@@ -67,8 +87,12 @@ extern "C" {
      @param nblocks Number of blocks
      @param stride Stride between blocks in bytes
   */
-  MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
-                                                   size_t blksize, int nblocks, size_t stride);
+  MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
+                                                    void *buffer, int dim, int dir,
+                                                    size_t blksize, int nblocks, size_t stride);
+
+#define comm_declare_strided_receive_relative(buffer, dim, dir, blksize, nblocks, stride) \
+  comm_declare_strided_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

   void comm_finalize(void);
   void comm_dim_partitioned_set(int dim);
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 97a09cf54b..6cee431084 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -52,32 +52,27 @@ namespace quda {
     if (getVerbosity() == QUDA_DEBUG_VERBOSE)
       printfQuda("Precision = %d, Subset = %d\n", precision, siteSubset);

-    int num_faces = 1;
-    int num_norm_faces=2;
-
-    // FIXME - this is a hack from hell that needs to be fixed.  When
-    // the TIFR interface is enabled we are forcing naive staggered
-    // support which breaks asqtad/hisq fermions.  The problem occurs
-    // because the ghost zone is allocated before we know which
-    // operator (and hence number of faces are needed).  One solution
-    // may be to separate the ghost zone memory allocation from the
-    // field itself, which has other benefits (1. on multi-gpu
-    // machines with UVA, we can read the ghost zone directly from the
-    // neighbouring field and 2.) we can use a single contiguous
-    // buffer for the ghost zone and its norm which will reduce
-    // latency for half precision and allow us to enable GPU_COMMS
-    // support for half precision).
-#ifdef BUILD_TIFR_INTERFACE
-    if (nSpin == 1) { //staggered
-      num_faces=2;
-      num_norm_faces=2;
-    }
-#else
-    if (nSpin == 1) { // improved staggered
-      num_faces=6;
-      num_norm_faces=6;
-    }
-#endif
+    // FIXME - The ghost zone is allocated before we know which
+    // operator (and hence how many faces) will be needed, so we
+    // allocate a ghost zone large enough to cope with the maximum
+    // number of faces.  All Wilson-like operators only involve the
+    // exchange of one face, so this is not a problem.  However, for
+    // staggered fermions we have either nFace=1 or nFace=3, so we
+    // allocate using the latter.  This artificially raises the GPU
+    // memory requirements for naive staggered fermions.  One potential
+    // future solution is to separate the ghost zone memory allocation
+    // from the field itself, which has other benefits: (1) on
+    // multi-gpu machines with UVA, we can read the ghost zone directly
+    // from the neighbouring field, and (2) we can use a single
+    // contiguous buffer for the ghost zone and its norm, which will
+    // reduce latency for half precision and allow us to enable
+    // GPU_COMMS support for half precision.
+    int nFaceGhost = (nSpin == 1) ? 3 : 1;
+
+    // For Wilson-like fields this gives the number of effective faces,
+    // which is halved relative to staggered since the fields are spin
+    // projected before communication.
+    int num_faces = ((nSpin == 1) ? 2 : 1) * nFaceGhost;
+    int num_norm_faces = 2*nFaceGhost;

     // calculate size of ghost zone required
     int ghostVolume = 0;
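For concreteness, a worked check of the face counts produced by the expressions above (values only; `check_face_counts` is an illustrative helper, not QUDA API):

```c
#include <assert.h>

/* Worked example of the ghost-zone face counts for the two cases. */
static void check_face_counts(int nSpin)
{
  int nFaceGhost     = (nSpin == 1) ? 3 : 1;
  int num_faces      = ((nSpin == 1) ? 2 : 1) * nFaceGhost;
  int num_norm_faces = 2 * nFaceGhost;

  if (nSpin == 1) { /* staggered: always sized for improved, nFace = 3 */
    assert(num_faces == 6 && num_norm_faces == 6);
  } else {          /* Wilson-like: a single spin-projected face */
    assert(num_faces == 1 && num_norm_faces == 2);
  }
}
```

These match the values the deleted branch hard-coded for Wilson (1 and 2) and for improved staggered (6 and 6); naive staggered now simply inherits the larger footprint.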
diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp
index e1ff605351..9285a07209 100644
--- a/lib/comm_common.cpp
+++ b/lib/comm_common.cpp
@@ -1,4 +1,5 @@
 #include <unistd.h> // for gethostname()
+#include <algorithm>
 #include
 #include

@@ -226,20 +227,69 @@ int comm_coord(int dim)
 /**
  * Send to the "dir" direction in the "dim" dimension
  */
-MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes)
+MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
+                                       void *buffer, int dim, int dir, size_t nbytes)
 {
+#ifdef HOST_DEBUG
+  cudaPointerAttributes attributes;
+  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
+  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
+    // test this memory allocation is ok by doing a memcpy from it
+    void *tmp = safe_malloc(nbytes);
+    try {
+      std::copy(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, static_cast<char*>(tmp));
+    } catch(std::exception &e) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
+      errorQuda("aborting");
+    }
+    if (err != cudaSuccess) cudaGetLastError();
+    host_free(tmp);
+  } else {
+    // test this memory allocation is ok by doing a memcpy from it
+    void *tmp = device_malloc(nbytes);
+    cudaError_t err = cudaMemcpy(tmp, buffer, nbytes, cudaMemcpyDeviceToDevice);
+    if (err != cudaSuccess) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
+      errorQuda("aborting with error %s", cudaGetErrorString(err));
+    }
+    device_free(tmp);
+  }
+#endif
+
   int disp[QUDA_MAX_DIM] = {0};
   disp[dim] = dir;

   return comm_declare_send_displaced(buffer, disp, nbytes);
 }

-
 /**
  * Receive from the "dir" direction in the "dim" dimension
  */
-MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes)
+MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
+                                          void *buffer, int dim, int dir, size_t nbytes)
 {
+#ifdef HOST_DEBUG
+  cudaPointerAttributes attributes;
+  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
+  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
+    // test this memory allocation is ok by filling it
+    try {
+      std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer)+nbytes, 0);
+    } catch(std::exception &e) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
+      errorQuda("aborting");
+    }
+    if (err != cudaSuccess) cudaGetLastError();
+  } else {
+    // test this memory allocation is ok by doing a memset
+    cudaError_t err = cudaMemset(buffer, 0, nbytes);
+    if (err != cudaSuccess) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, nbytes=%zu)\n", file, line, func, dim, dir, nbytes);
+      errorQuda("aborting with error %s", cudaGetErrorString(err));
+    }
+  }
+#endif
+
   int disp[QUDA_MAX_DIM] = {0};
   disp[dim] = dir;
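Both HOST_DEBUG guards above follow the same recipe: classify the pointer with `cudaPointerGetAttributes`, then touch every byte the later communication will touch — copying out of a send buffer (non-destructive), filling a receive buffer — so that an undersized or stale allocation faults here, with file/line context, rather than deep inside the communication layer. A condensed stand-alone sketch of the receive-side check, with the QUDA wrappers (`printfQuda`, `errorQuda`, `safe_malloc`) replaced by plain equivalents:

```cpp
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Probe that [buffer, buffer+nbytes) is fully addressable before a receive
// is posted into it, so a bad extent fails fast with call-site context.
static void check_recv_buffer(void *buffer, size_t nbytes, const char *file, int line)
{
  cudaPointerAttributes attr;
  cudaError_t err = cudaPointerGetAttributes(&attr, buffer);

  if (err != cudaSuccess) {
    // Unregistered host memory: clear the sticky CUDA error, then walk
    // the full extent from the host.
    cudaGetLastError();
    std::fill(static_cast<char*>(buffer), static_cast<char*>(buffer) + nbytes, 0);
  } else {
    // Device-accessible memory: probe the extent with a device-side memset.
    err = cudaMemset(buffer, 0, nbytes);
    if (err != cudaSuccess) {
      fprintf(stderr, "bad receive buffer of %zu bytes (%s:%d): %s\n",
              nbytes, file, line, cudaGetErrorString(err));
      exit(EXIT_FAILURE);
    }
  }
}
```

Zero-filling is safe on the receive side because the contents are about to be overwritten by the incoming message; the send-side check instead copies out of the buffer into a scratch allocation, leaving the payload untouched.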
@@ -249,9 +299,38 @@ MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t
 /**
  * Strided send to the "dir" direction in the "dim" dimension
  */
-MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
-                                              size_t blksize, int nblocks, size_t stride)
+MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
+                                               void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
 {
+#ifdef HOST_DEBUG
+  cudaPointerAttributes attributes;
+  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
+  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
+    // test this memory allocation is ok by doing a memcpy from it
+    void *tmp = safe_malloc(blksize*nblocks);
+    try {
+      for (int i=0; i<nblocks; i++)
+        std::copy(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, static_cast<char*>(tmp));
+    } catch(std::exception &e) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
+                 file, line, func, dim, dir, blksize, nblocks, stride);
+      errorQuda("aborting");
+    }
+    host_free(tmp);
+    if (err != cudaSuccess) cudaGetLastError();
+  } else {
+    // test this memory allocation is ok by doing a memcpy from it
+    void *tmp = device_malloc(blksize*nblocks);
+    cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride, blksize, nblocks, cudaMemcpyDeviceToDevice);
+    if (err != cudaSuccess) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
+                 file, line, func, dim, dir, blksize, nblocks, stride);
+      errorQuda("aborting with error %s", cudaGetErrorString(err));
+    }
+    device_free(tmp);
+  }
+#endif
+
   int disp[QUDA_MAX_DIM] = {0};
   disp[dim] = dir;

@@ -262,9 +341,34 @@ MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
 /**
  * Strided receive from the "dir" direction in the "dim" dimension
  */
-MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
-                                                 size_t blksize, int nblocks, size_t stride)
+MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
+                                                  void *buffer, int dim, int dir, size_t blksize, int nblocks, size_t stride)
 {
+#ifdef HOST_DEBUG
+  cudaPointerAttributes attributes;
+  cudaError_t err = cudaPointerGetAttributes(&attributes, buffer);
+  if (err != cudaSuccess || attributes.memoryType == cudaMemoryTypeHost) {
+    // test this memory allocation is ok by filling it
+    try {
+      for (int i=0; i<nblocks; i++)
+        std::fill(static_cast<char*>(buffer)+i*stride, static_cast<char*>(buffer)+i*stride+blksize, 0);
+    } catch(std::exception &e) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
+                 file, line, func, dim, dir, blksize, nblocks, stride);
+      errorQuda("aborting");
+    }
+    if (err != cudaSuccess) cudaGetLastError();
+  } else {
+    // test this memory allocation is ok by doing a memset
+    cudaError_t err = cudaMemset2D(buffer, stride, 0, blksize, nblocks);
+    if (err != cudaSuccess) {
+      printfQuda("ERROR: buffer failed (%s:%d in %s(), dim=%d, dir=%d, blksize=%zu nblocks=%d stride=%zu)\n",
+                 file, line, func, dim, dir, blksize, nblocks, stride);
+      errorQuda("aborting with error %s", cudaGetErrorString(err));
+    }
+  }
+#endif
+
   int disp[QUDA_MAX_DIM] = {0};
   disp[dim] = dir;
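The strided variants above apply the same probes per block: nblocks blocks of blksize bytes spaced stride bytes apart. On the device side a single `cudaMemcpy2D` (or `cudaMemset2D` for receives) covers the whole pattern in one call. A minimal sketch of the device-side send check (illustrative, not the QUDA wrappers):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Probe a strided device send buffer.  cudaMemcpy2D reads exactly the bytes
// a strided send would read, so an undersized allocation is caught up front.
static bool check_strided_send(const void *buffer, size_t blksize, int nblocks, size_t stride)
{
  void *tmp = NULL;
  if (cudaMalloc(&tmp, blksize * nblocks) != cudaSuccess) return false;

  // dpitch = blksize packs the blocks contiguously into tmp, while
  // spitch = stride walks the source at the communication stride.
  cudaError_t err = cudaMemcpy2D(tmp, blksize, buffer, stride,
                                 blksize, nblocks, cudaMemcpyDeviceToDevice);
  if (err != cudaSuccess)
    fprintf(stderr, "strided probe failed: %s\n", cudaGetErrorString(err));

  cudaFree(tmp);
  return err == cudaSuccess;
}
```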
diff --git a/lib/cuda_color_spinor_field.cu b/lib/cuda_color_spinor_field.cu
index 5611178541..3e1bdb1581 100644
--- a/lib/cuda_color_spinor_field.cu
+++ b/lib/cuda_color_spinor_field.cu
@@ -552,7 +552,6 @@ namespace quda {
     // only allocate if not already allocated or buffer required is bigger than previously
     if(initGhostFaceBuffer == 0 || faceBytes > ghostFaceBytes){
-
       if (initGhostFaceBuffer){
        for(int b=0; b<2; ++b) device_free(ghostFaceBuffer[b]);
       }

@@ -945,13 +944,14 @@ namespace quda {
       if (!commDimPartitioned(i)) continue;

 #ifdef GPU_COMMS
       size_t nbytes_Nface = surfaceCB[i]*Ndof*precision*(j+1);
+      size_t nbytes_Nface_norm = surfaceCB[i]*(j+1)*sizeof(float);
       if (i != 3 || getKernelPackT() || getTwistPack()) {
 #else
       size_t nbytes_Nface = (nbytes[i] / maxNface) * (j+1);
 #endif

       for(int b=0; b<2; ++b){
-        mh_send_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_face[b][i], i, +1, nbytes_Nface);
-        mh_send_back[b][j][2*i+0] = comm_declare_send_relative(my_back_face[b][i], i, -1, nbytes_Nface);
+        mh_send_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_fwd_face[b][i], i, +1, nbytes_Nface) : NULL;
+        mh_send_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_back_face[b][i], i, -1, nbytes_Nface) : NULL;
         mh_send_fwd[b][j][2*i+1] = mh_send_fwd[b][j][2*i];   // alias pointers
         mh_send_back[b][j][2*i+1] = mh_send_back[b][j][2*i]; // alias pointers
       }

@@ -959,8 +959,8 @@ namespace quda {
       if(precision == QUDA_HALF_PRECISION){
        for(int b=0; b<2; ++b){
-         mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, surfaceCB[i]*(j+1)*sizeof(float));
-         mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(my_back_norm_face[b][i], i, -1, surfaceCB[i]*(j+1)*sizeof(float));
+         mh_send_norm_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm) : NULL;
+         mh_send_norm_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(my_back_norm_face[b][i], i, -1, nbytes_Nface_norm) : NULL;
          mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i];
          mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i];
        }

@@ -1012,11 +1012,12 @@ namespace quda {
          //printf("%d strided sends with Nface=%d Nblocks=%d blksize=%d Stride=%d\n", i, j+1, Nblocks, blksize, Stride);
          for(int b=0; b<2; ++b){
-           mh_send_fwd[b][j][2*i+0] = comm_declare_strided_send_relative(base[2], i, +1, blksize, Nblocks, Stride);
-           mh_send_back[b][j][2*i+0] = comm_declare_strided_send_relative(base[0], i, -1, blksize, Nblocks, Stride);
+           // only allocate a communicator for the present face (this needs cleaning up)
+           mh_send_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[2], i, +1, blksize, Nblocks, Stride) : NULL;
+           mh_send_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[0], i, -1, blksize, Nblocks, Stride) : NULL;
            if (nSpin == 4) { // dagger communicators
-             mh_send_fwd[b][j][2*i+1] = comm_declare_strided_send_relative(base[3], i, +1, blksize, Nblocks, Stride);
-             mh_send_back[b][j][2*i+1] = comm_declare_strided_send_relative(base[1], i, -1, blksize, Nblocks, Stride);
+             mh_send_fwd[b][j][2*i+1] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[3], i, +1, blksize, Nblocks, Stride) : NULL;
+             mh_send_back[b][j][2*i+1] = (j+1 == nFace) ? comm_declare_strided_send_relative(base[1], i, -1, blksize, Nblocks, Stride) : NULL;
            } else {
              mh_send_fwd[b][j][2*i+1] = mh_send_fwd[b][j][2*i+0];
              mh_send_back[b][j][2*i+1] = mh_send_back[b][j][2*i+0];
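Every `(j+1 == nFace) ? ... : NULL` edit in this file is the same change: handles are indexed by face depth `j = 0 .. maxNface-1`, but only the depth the operator actually posts (`j+1 == nFace`) now gets a real handle, and the unused slots are left NULL. Schematically (`create_handle` and `declare_handles` are hypothetical stand-ins for the `comm_declare_*` calls, not QUDA API):

```cpp
#include <cstddef>

struct MsgHandle;
MsgHandle *create_handle(int depth); // hypothetical stand-in factory

void declare_handles(MsgHandle *handle[], int maxNface, int nFace)
{
  for (int j = 0; j < maxNface; j++) {
    // Only the face depth actually communicated gets a persistent handle;
    // unused slots stay NULL so accidental use fails fast.
    handle[j] = (j + 1 == nFace) ? create_handle(j + 1) : NULL;
  }
}
```

This avoids allocating persistent communicators that are never posted; as the in-diff comment notes, a cleaner long-term structure is still needed.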
@@ -1030,8 +1031,8 @@ namespace quda {
          void *norm_fwd = static_cast<float*>(norm) + Nt_minus1_offset;
          void *norm_back = norm; // the first time slice has zero offset
          for(int b=0; b<2; ++b){
-           mh_send_norm_fwd[b][j][2*i+0] = comm_declare_send_relative(norm_fwd, i, +1, surfaceCB[i]*(j+1)*sizeof(float));
-           mh_send_norm_back[b][j][2*i+0] = comm_declare_send_relative(norm_back, i, -1, surfaceCB[i]*(j+1)*sizeof(float));
+           mh_send_norm_fwd[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(norm_fwd, i, +1, surfaceCB[i]*(j+1)*sizeof(float)) : NULL;
+           mh_send_norm_back[b][j][2*i+0] = (j+1 == nFace) ? comm_declare_send_relative(norm_back, i, -1, surfaceCB[i]*(j+1)*sizeof(float)) : NULL;
            mh_send_norm_fwd[b][j][2*i+1] = mh_send_norm_fwd[b][j][2*i];
            mh_send_norm_back[b][j][2*i+1] = mh_send_norm_back[b][j][2*i];
          }

@@ -1041,15 +1042,15 @@ namespace quda {
        if(precision == QUDA_HALF_PRECISION){
          for(int b=0; b<2; ++b){
-           mh_recv_norm_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, surfaceCB[i]*sizeof(float)*(j+1));
-           mh_recv_norm_back[b][j][i] = comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, surfaceCB[i]*sizeof(float)*(j+1));
+           mh_recv_norm_fwd[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_fwd_norm_face[b][i], i, +1, nbytes_Nface_norm) : NULL;
+           mh_recv_norm_back[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_back_norm_face[b][i], i, -1, nbytes_Nface_norm) : NULL;
          }
        }
 #endif // GPU_COMMS

        for(int b=0; b<2; ++b){
-         mh_recv_fwd[b][j][i] = comm_declare_receive_relative(from_fwd_face[b][i], i, +1, nbytes_Nface);
-         mh_recv_back[b][j][i] = comm_declare_receive_relative(from_back_face[b][i], i, -1, nbytes_Nface);
+         mh_recv_fwd[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_fwd_face[b][i], i, +1, nbytes_Nface) : NULL;
+         mh_recv_back[b][j][i] = (j+1 == nFace) ? comm_declare_receive_relative(from_back_face[b][i], i, -1, nbytes_Nface) : NULL;
        }

@@ -1070,22 +1071,22 @@ namespace quda {
        for (int j=0; j