Commit c171d23

Removed many warning messages when compiling for single GPU. Fixed a bug in KS force computation. Partially enabled building on Tesla architectures (more to do).
maddyscientist committed Nov 25, 2014
1 parent 8077dc7 commit c171d23
Showing 16 changed files with 71 additions and 24 deletions.
8 changes: 7 additions & 1 deletion include/gauge_field_order.h
@@ -437,7 +437,7 @@ namespace quda {
Reconstruct<reconLen,Float> reconstruct;
Float *gauge[2];
Float *ghost[4];
int faceVolumeCB[QUDA_MAX_DIM];
int faceVolumeCB[4];
const int volumeCB;
const int stride;
const int geometry;
@@ -572,6 +572,9 @@ namespace quda {

__device__ __host__ inline void loadGhostEx(RegType v[length], int buff_idx, int extended_idx, int dir,
int dim, int g, int parity, const int R[]) const {
#if __COMPUTE_CAPABILITY__ < 200
const int hasPhase = 0;
#endif
const int M = reconLen / N;
RegType tmp[reconLen];
for (int i=0; i<M; i++) {
@@ -592,6 +595,9 @@

__device__ __host__ inline void saveGhostEx(const RegType v[length], int buff_idx, int extended_idx,
int dir, int dim, int g, int parity, const int R[]) {
#if __COMPUTE_CAPABILITY__ < 200
const int hasPhase = 0;
#endif
const int M = reconLen / N;
RegType tmp[reconLen];
// use the extended_idx to determine the boundary condition
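Note on the hunks above: `hasPhase` is evidently not available when building for compute capability below 2.0, so `loadGhostEx` and `saveGhostEx` now shadow it with a local constant of zero, which keeps the shared indexing arithmetic compiling unchanged. A minimal sketch of the pattern, with illustrative names rather than the actual QUDA accessor:

// Sketch only (GhostAccessor/offset are made-up names): when a member is
// compiled out on older architectures, a local constant with the same name
// lets the common code path below build unchanged.
struct GhostAccessor {
#if __COMPUTE_CAPABILITY__ >= 200
  int hasPhase;                       // phase term only present on Fermi and later (assumption)
#endif
  __device__ __host__ inline int offset(int idx, int stride) const {
#if __COMPUTE_CAPABILITY__ < 200
    const int hasPhase = 0;           // shadow the missing member
#endif
    return idx * (stride + hasPhase); // identical expression on every architecture
  }
};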
2 changes: 1 addition & 1 deletion include/texture.h
@@ -126,7 +126,7 @@ template<typename OutputType, typename InputType, int tex_id>
{
// only bind if bytes > 0
if (x->Bytes()) {
if (tex_id_table[tex_id] && tex_id > 0 && tex_id <= MAX_TEX_ID) {
if (tex_id > 0 && tex_id <= MAX_TEX_ID && tex_id_table[tex_id]) {
errorQuda("Already bound to this texture reference");
} else {
tex_id_table[tex_id] = true;
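The reordered condition above is a short-circuit fix: the range check on `tex_id` now runs before `tex_id_table[tex_id]` is read, so an out-of-range id can no longer index past the table. A self-contained illustration of the idiom, using a toy table rather than QUDA's `tex_id_table`:

#include <cstdio>

// Toy illustration (not QUDA's tex_id_table): with the range test first, &&
// short-circuits and the table is never indexed with an out-of-range id.
static const int MAX_ID = 7;
static bool table[MAX_ID + 1];

bool alreadyBound(int id) {
  // Old order: table[id] was read before id had been validated.
  //   return table[id] && id > 0 && id <= MAX_ID;
  // New order, as in the commit:
  return id > 0 && id <= MAX_ID && table[id];
}

int main() {
  table[3] = true;
  printf("%d %d %d\n", alreadyBound(3), alreadyBound(7), alreadyBound(1000));
  return 0;
}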
8 changes: 8 additions & 0 deletions lib/copy_gauge_extended.cu
@@ -105,8 +105,12 @@ namespace quda {
if (location == QUDA_CPU_FIELD_LOCATION) {
copyGaugeEx<FloatOut, FloatIn, length>(arg);
} else if (location == QUDA_CUDA_FIELD_LOCATION) {
#if (__COMPUTE_CAPABILITY__ >= 200)
copyGaugeExKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#else
errorQuda("Extended gauge copy not supported on pre-Fermi architecture");
#endif
}
}

@@ -124,8 +128,12 @@ namespace quda {
long long flops() const { return 0; }
long long bytes() const {
int sites = 4*arg.volume/2;
#if (__COMPUTE_CAPABILITY__ >= 200)
return 2 * sites * ( arg.in.Bytes() + arg.in.hasPhase*sizeof(FloatIn)
+ arg.out.Bytes() + arg.out.hasPhase*sizeof(FloatOut) );
#else
return 2 * sites * ( arg.in.Bytes() + arg.out.Bytes() );
#endif
}
};

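This file shows the guard pattern that recurs throughout the commit: device kernels that cannot be compiled for compute capability below 2.0 are fenced off at compile time, and the launch site falls back to a runtime error so Tesla builds still link. A self-contained sketch of the idea (toy kernel, not `copyGaugeExKernel`; `__COMPUTE_CAPABILITY__` is the macro the build passes via `-D`, as in the make.inc.in change at the bottom of this commit):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Toy version of the guard: compile and launch the kernel only when the
// target architecture supports it, otherwise fail loudly at run time.
#if (__COMPUTE_CAPABILITY__ >= 200)
__global__ void copyKernelToy(float *dst, const float *src, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) dst[i] = src[i];
}
#endif

void applyCopyToy(float *dst, const float *src, int n, cudaStream_t stream) {
#if (__COMPUTE_CAPABILITY__ >= 200)
  dim3 block(128);
  dim3 grid((n + block.x - 1) / block.x);
  copyKernelToy<<<grid, block, 0, stream>>>(dst, src, n);
#else
  fprintf(stderr, "copy not supported on pre-Fermi architecture\n");
  exit(1);
#endif
}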
4 changes: 4 additions & 0 deletions lib/copy_gauge_inc.cu
@@ -142,13 +142,17 @@ namespace quda {

void apply(const cudaStream_t &stream) {
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
#if (__COMPUTE_CAPABILITY__ >= 200)
if (!isGhost) {
copyGaugeKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
} else {
copyGhostKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
}
#else
errorQuda("Gauge copy not supported on pre-Fermi architecture");
#endif
}

TuneKey tuneKey() const { return TuneKey(vol, typeid(*this).name(), aux); }
6 changes: 3 additions & 3 deletions lib/cuda_color_spinor_field.cu
@@ -604,6 +604,7 @@ namespace quda {
const int dagger, cudaStream_t *stream,
void *buffer, double a)
{
#ifdef MULTI_GPU
int face_num;
if(dir == QUDA_BACKWARDS){
face_num = 0;
@@ -612,7 +613,6 @@
}else{
face_num = 2;
}
#ifdef MULTI_GPU
void *packBuffer = buffer ? buffer : ghostFaceBuffer[bufferIndex];
packFace(packBuffer, *this, clov, clovInv, nFace, dagger, parity, dim, face_num, *stream, a);
#else
@@ -627,6 +627,7 @@ namespace quda {
const int dagger, cudaStream_t *stream,
void *buffer, double a, double b)
{
#ifdef MULTI_GPU
int face_num;
if(dir == QUDA_BACKWARDS){
face_num = 0;
@@ -635,7 +636,6 @@
}else{
face_num = 2;
}
#ifdef MULTI_GPU
void *packBuffer = buffer ? buffer : ghostFaceBuffer[bufferIndex];
packFace(packBuffer, *this, nFace, dagger, parity, dim, face_num, *stream, a, b);
#else
@@ -786,6 +786,7 @@ namespace quda {
const int dagger, cudaStream_t *stream,
void *buffer)
{
#ifdef MULTI_GPU
int face_num;
if(dir == QUDA_BACKWARDS){
face_num = 0;
@@ -794,7 +795,6 @@
}else{
face_num = 2;
}
#ifdef MULTI_GPU
void *packBuffer = buffer ? buffer : ghostFaceBuffer[bufferIndex];
packFaceExtended(packBuffer, *this, nFace, R, dagger, parity, dim, face_num, *stream);
#else
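The three hunks in this file all make the same change: the `#ifdef MULTI_GPU` is hoisted above the `face_num` computation, so single-GPU builds no longer compile a variable that is never used (a likely source of several of the warnings this commit removes). A toy sketch of the fix, with a `printf` standing in for the real `packFace` call:

#include <cstdio>

// Toy sketch (not the actual packGhost): hoisting the #ifdef removes the
// dead face_num computation from single-GPU builds entirely.
void packGhostToy(int dir) {
#ifdef MULTI_GPU
  // Previously this block sat outside the #ifdef: in single-GPU builds
  // face_num was computed, never used, and the compiler warned about it.
  int face_num;
  if (dir == 0)      face_num = 0;  // QUDA_BACKWARDS
  else if (dir == 1) face_num = 1;  // QUDA_FORWARDS
  else               face_num = 2;  // pack both faces
  printf("packing ghost face %d\n", face_num);
#else
  (void)dir;  // single-GPU build: no halo exchange, nothing to pack
#endif
}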
2 changes: 2 additions & 0 deletions lib/dslash_core/staggered_dslash_core.h
@@ -344,10 +344,12 @@ spinorFloat o02_im;

const int X1X0 = X[1]*X[0];
const int X2X1X0 = X[2]*X1X0;
#ifdef MULTI_GPU
#if (DD_IMPROVED == 1)
const int X3X1X0 = X[3]*X1X0;
#endif
const int half_volume = (X[0]*X[1]*X[2]*X[3] >> 1);
#endif

int za,zb;
int x0h;
22 changes: 10 additions & 12 deletions lib/dslash_core/staggered_fused_exterior_dslash_core.h
@@ -346,23 +346,18 @@ spinorFloat o02_im;

const int X1X0 = X[1]*X[0];
const int X2X1X0 = X[2]*X1X0;
#if (DD_IMPROVED == 1)
#if (DD_IMPROVED == 1 && defined(MULTI_GPU))
const int X3X1X0 = X[3]*X1X0;
#endif
const int half_volume = (X[0]*X[1]*X[2]*X[3] >> 1);

int za,zb;
int y[4] = {};

int full_idx=0;
#ifdef MULTI_GPU
int x0h;
int y[4];
int x0odd;
int full_idx;
int half_idx;


bool active = false;
int dim=4;
#ifdef MULTI_GPU
dim = dimFromFaceIndex (idx, param);
int dim = dimFromFaceIndex (idx, param);

if(dim == 0){
coordsFromFaceIndexStaggered<NFACE,2>(y, idx, param.parity, EXTERIOR_KERNEL_X, X);
@@ -374,9 +369,12 @@ spinorFloat o02_im;
coordsFromFaceIndexStaggered<NFACE,2>(y, idx, param.parity, EXTERIOR_KERNEL_T, X);
}

int za,zb;

const int half_volume = (X[0]*X[1]*X[2]*X[3] >> 1);
full_idx = ((y[3]*X[2] +y[2])*X[1] +y[1])*X[0]+y[0];
half_idx = full_idx>>1;
#endif // MULTI_GPU
int half_idx = full_idx>>1;



5 changes: 4 additions & 1 deletion lib/dslash_pack.cu
@@ -30,11 +30,14 @@ namespace quda {

using namespace pack;

#ifdef MULTI_GPU
static int commDim[QUDA_MAX_DIM]; // Whether to do comms or not
void setPackComms(const int *comm_dim) {
for (int i=0; i<QUDA_MAX_DIM; i++) commDim[i] = comm_dim[i];
}

#else
void setPackComms(const int *comm_dim) { ; }
#endif

#include <dslash_index.cuh>

2 changes: 1 addition & 1 deletion lib/dslash_policy.cuh
@@ -982,8 +982,8 @@ struct DslashFusedExterior : DslashPolicyImp {
dslashParam.kernel_type = INTERIOR_KERNEL;
dslashParam.threads = volume;

int scatterIndex = 0;
#ifdef MULTI_GPU
int scatterIndex = 0;
// Record the start of the dslash if doing communication in T and not kernel packing
if (dslashParam.commDim[3] && !(getKernelPackT() || getTwistPack()))
{
2 changes: 1 addition & 1 deletion lib/dslash_quda.cu
@@ -42,7 +42,7 @@
#endif
#endif // GPU_STAGGERED_DIRAC

#include <quda_internal.h>h
#include <quda_internal.h>
#include <dslash_quda.h>
#include <sys/time.h>
#include <blas_quda.h>
6 changes: 6 additions & 0 deletions lib/dslash_twisted_clover.cu
@@ -36,7 +36,9 @@ namespace quda {
//#define SHARED_WILSON_DSLASH
//#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access

#if (__COMPUTE_CAPABILITY__ >= 200)
#include <tmc_dslash_def.h> // Twisted Clover kernels
#endif

#ifndef DSLASH_SHARED_FLOATS_PER_THREAD
#define DSLASH_SHARED_FLOATS_PER_THREAD 0
@@ -123,6 +125,7 @@ namespace quda {
#endif
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#if (__COMPUTE_CAPABILITY__ >= 200)
switch(dslashType){

case QUDA_DEG_CLOVER_TWIST_INV_DSLASH:
@@ -142,6 +145,9 @@ namespace quda {
break;
default: errorQuda("Invalid twisted clover dslash type");
}
#else
errorQuda("Twisted-clover fermions not supported on pre-Fermi architecture");
#endif
}

long long flops() const { return (x ? 1416ll : 1392ll) * in->VolumeCB(); } // FIXME for multi-GPU
4 changes: 4 additions & 0 deletions lib/extract_gauge_ghost.cu
@@ -137,9 +137,13 @@ namespace quda {
virtual ~ExtractGhost() { ; }

void apply(const cudaStream_t &stream) {
#if (__COMPUTE_CAPABILITY__ >= 200)
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
extractGhostKernel<Float, length, nDim, Order>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#else
errorQuda("extractGhost not supported on pre-Fermi architecture");
#endif
}

TuneKey tuneKey() const { return TuneKey(vol, typeid(*this).name(), aux); }
9 changes: 9 additions & 0 deletions lib/extract_gauge_ghost_extended.cu
@@ -220,21 +220,30 @@ namespace quda {
if (location==QUDA_CPU_FIELD_LOCATION) {
extractGhostEx<Float,length,nDim,Order,true>(arg);
} else {
#if (__COMPUTE_CAPABILITY__ >= 200)
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
tp.grid.y = 2;
tp.grid.z = 2;
extractGhostExKernel<Float,length,nDim,Order,true>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#else
errorQuda("extractGhostEx not supported on pre-Fermi architecture");
#endif

}
} else { // we are injecting
if (location==QUDA_CPU_FIELD_LOCATION) {
extractGhostEx<Float,length,nDim,Order,false>(arg);
} else {
#if (__COMPUTE_CAPABILITY__ >= 200)
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
tp.grid.y = 2;
tp.grid.z = 2;
extractGhostExKernel<Float,length,nDim,Order,false>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#else
errorQuda("extractGhostEx not supported on pre-Fermi architecture");
#endif
}
}
}
12 changes: 8 additions & 4 deletions lib/ks_force_quda.cu
@@ -122,10 +122,6 @@ namespace quda {
{
int idx = threadIdx.x + blockIdx.x*blockDim.x;

if(idx==0){
printf("arg.threads = %d\n", arg.threads);
}

if(idx >= arg.threads) return;
completeKSForceCore<Float,Oprod,Gauge,Mom>(arg,idx);
}
@@ -168,10 +164,14 @@ namespace quda {

void apply(const cudaStream_t &stream) {
if(location == QUDA_CUDA_FIELD_LOCATION){
#if (__COMPUTE_CAPABILITY__ >= 200)
// Fix this
dim3 blockDim(128, 1, 1);
dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
completeKSForceKernel<Float><<<gridDim,blockDim>>>(arg);
#else
errorQuda("completeKSForce not supported on pre-Fermi architecture");
#endif
}else{
completeKSForceCPU<Float>(arg);
}
@@ -385,10 +385,14 @@ class KSLongLinkForce : Tunable {

void apply(const cudaStream_t &stream) {
if(location == QUDA_CUDA_FIELD_LOCATION){
#if (__COMPUTE_CAPABILITY__ >= 200)
// Fix this
dim3 blockDim(128, 1, 1);
dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
computeKSLongLinkForceKernel<Float><<<gridDim,blockDim>>>(arg);
#else
errorQuda("computeKSLongLinkForce not supported on pre-Fermi architecture");
#endif
}else{
computeKSLongLinkForceCPU<Float>(arg);
}
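Besides dropping the stray debug `printf` from the kernel and adding the pre-Fermi guards, the launch sites kept here size the grid with the usual ceiling division `(arg.threads + blockDim.x - 1) / blockDim.x`, so every site is covered even when the thread count is not a multiple of the block size. A quick standalone check of that arithmetic (the numbers are illustrative only):

#include <cstdio>
#include <initializer_list>

// Standalone check of the grid-sizing arithmetic used above: rounding up the
// block count guarantees gridDim.x * blockDim.x >= threads.
int main() {
  const int blockDimX = 128;
  for (int threads : {1, 128, 129, 1000}) {
    int gridDimX = (threads + blockDimX - 1) / blockDimX;  // ceiling division
    printf("threads=%4d -> %2d blocks, %4d thread slots\n",
           threads, gridDimX, gridDimX * blockDimX);
  }
  return 0;
}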
1 change: 1 addition & 0 deletions lib/misc_helpers.cu
@@ -540,6 +540,7 @@ namespace quda {

case 2:
switch(whichway){
case QUDA_BACKWARDS:
collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
break;
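The one-line addition here restores a missing `case QUDA_BACKWARDS:` label: in C++, statements placed inside a `switch` before its first `case` label are never executed, so without the label the two backward-direction `collectGhostStapleKernel` launches were dead code. A minimal reproduction of the pitfall:

#include <cstdio>

// Toy reproduction: without the "case BACKWARDS:" label, the first branch is
// unreachable, and most compilers only emit a warning about it.
enum Direction { BACKWARDS, FORWARDS };

void collect(Direction whichway) {
  switch (whichway) {
    case BACKWARDS:                       // the label this commit adds
      printf("collect backwards ghost staple\n");
      break;
    case FORWARDS:
      printf("collect forwards ghost staple\n");
      break;
  }
}

int main() {
  collect(BACKWARDS);
  collect(FORWARDS);
  return 0;
}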
2 changes: 2 additions & 0 deletions make.inc.in
@@ -106,6 +106,8 @@ NVCCOPT += -D__COMPUTE_CAPABILITY__=$(COMP_CAP)
TESLA_ARCH = $(shell [ $(COMP_CAP) -lt 200 ] && echo true)
ifneq ($(TESLA_ARCH),true)
NVCCOPT += -ftz=true -prec-div=false -prec-sqrt=false -w
else
NVCCOPT += -w
endif

ifeq ($(strip $(BUILD_MULTI_GPU)), yes)
