Commit c171d23

Removed many warning messages when compiling for single GPU. Fixed a bug in KS force computation. Partially enabled building on Tesla architectures (more to do).
maddyscientist committed Nov 25, 2014
1 parent 8077dc7 commit c171d23
Showing 16 changed files with 71 additions and 24 deletions.
8 changes: 7 additions & 1 deletion include/gauge_field_order.h
@@ -437,7 +437,7 @@ namespace quda {
Reconstruct<reconLen,Float> reconstruct;
Float *gauge[2];
Float *ghost[4];
int faceVolumeCB[QUDA_MAX_DIM];
int faceVolumeCB[4];
const int volumeCB;
const int stride;
const int geometry;
@@ -572,6 +572,9 @@ namespace quda {

__device__ __host__ inline void loadGhostEx(RegType v[length], int buff_idx, int extended_idx, int dir,
int dim, int g, int parity, const int R[]) const {
#if __COMPUTE_CAPABILITY__ < 200
const int hasPhase = 0;
#endif
const int M = reconLen / N;
RegType tmp[reconLen];
for (int i=0; i<M; i++) {
@@ -592,6 +595,9 @@

__device__ __host__ inline void saveGhostEx(const RegType v[length], int buff_idx, int extended_idx,
int dir, int dim, int g, int parity, const int R[]) {
#if __COMPUTE_CAPABILITY__ < 200
const int hasPhase = 0;
#endif
const int M = reconLen / N;
RegType tmp[reconLen];
// use the extended_idx to determine the boundary condition
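Note on the hunks above: `hasPhase` is evidently not available when building for compute capability below 2.0, so `loadGhostEx` and `saveGhostEx` now shadow it with a local constant of zero, which keeps the shared indexing arithmetic compiling unchanged. A minimal sketch of the pattern, with illustrative names rather than the actual QUDA accessor:

// Sketch only (GhostAccessor/offset are made-up names): when a member is
// compiled out on older architectures, a local constant with the same name
// lets the common code path below build unchanged.
struct GhostAccessor {
#if __COMPUTE_CAPABILITY__ >= 200
  int hasPhase;                       // phase term only present on Fermi and later (assumption)
#endif
  __device__ __host__ inline int offset(int idx, int stride) const {
#if __COMPUTE_CAPABILITY__ < 200
    const int hasPhase = 0;           // shadow the missing member
#endif
    return idx * (stride + hasPhase); // identical expression on every architecture
  }
};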
2 changes: 1 addition & 1 deletion include/texture.h
@@ -126,7 +126,7 @@ template<typename OutputType, typename InputType, int tex_id>
{
// only bind if bytes > 0
if (x->Bytes()) {
if (tex_id_table[tex_id] && tex_id > 0 && tex_id <= MAX_TEX_ID) {
if (tex_id > 0 && tex_id <= MAX_TEX_ID && tex_id_table[tex_id]) {
errorQuda("Already bound to this texture reference");
} else {
tex_id_table[tex_id] = true;
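The reordered condition above is a short-circuit fix: the range check on `tex_id` now runs before `tex_id_table[tex_id]` is read, so an out-of-range id can no longer index past the table. A self-contained illustration of the idiom, using a toy table rather than QUDA's `tex_id_table`:

#include <cstdio>

// Toy illustration (not QUDA's tex_id_table): with the range test first, &&
// short-circuits and the table is never indexed with an out-of-range id.
static const int MAX_ID = 7;
static bool table[MAX_ID + 1];

bool alreadyBound(int id) {
  // Old order: table[id] was read before id had been validated.
  //   return table[id] && id > 0 && id <= MAX_ID;
  // New order, as in the commit:
  return id > 0 && id <= MAX_ID && table[id];
}

int main() {
  table[3] = true;
  printf("%d %d %d\n", alreadyBound(3), alreadyBound(7), alreadyBound(1000));
  return 0;
}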
8 changes: 8 additions & 0 deletions lib/copy_gauge_extended.cu
@@ -105,8 +105,12 @@ namespace quda {
if (location == QUDA_CPU_FIELD_LOCATION) {
copyGaugeEx<FloatOut, FloatIn, length>(arg);
} else if (location == QUDA_CUDA_FIELD_LOCATION) {
#if (__COMPUTE_CAPABILITY__ >= 200)
copyGaugeExKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#else
errorQuda("Extended gauge copy not supported on pre-Fermi architecture");
#endif
}
}

@@ -124,8 +128,12 @@ namespace quda {
long long flops() const { return 0; }
long long bytes() const {
int sites = 4*arg.volume/2;
#if (__COMPUTE_CAPABILITY__ >= 200)
return 2 * sites * ( arg.in.Bytes() + arg.in.hasPhase*sizeof(FloatIn)
+ arg.out.Bytes() + arg.out.hasPhase*sizeof(FloatOut) );
#else
return 2 * sites * ( arg.in.Bytes() + arg.out.Bytes() );
#endif
}
};

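This file shows the guard pattern that recurs throughout the commit: device kernels that cannot be compiled for compute capability below 2.0 are fenced off at compile time, and the launch site falls back to a runtime error so Tesla builds still link. A self-contained sketch of the idea (toy kernel, not `copyGaugeExKernel`; `__COMPUTE_CAPABILITY__` is the macro the build passes via `-D`, as in the make.inc.in change at the bottom of this commit):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Toy version of the guard: compile and launch the kernel only when the
// target architecture supports it, otherwise fail loudly at run time.
#if (__COMPUTE_CAPABILITY__ >= 200)
__global__ void copyKernelToy(float *dst, const float *src, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) dst[i] = src[i];
}
#endif

void applyCopyToy(float *dst, const float *src, int n, cudaStream_t stream) {
#if (__COMPUTE_CAPABILITY__ >= 200)
  dim3 block(128);
  dim3 grid((n + block.x - 1) / block.x);
  copyKernelToy<<<grid, block, 0, stream>>>(dst, src, n);
#else
  fprintf(stderr, "copy not supported on pre-Fermi architecture\n");
  exit(1);
#endif
}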
4 changes: 4 additions & 0 deletions lib/copy_gauge_inc.cu
@@ -142,13 +142,17 @@ namespace quda {

void apply(const cudaStream_t &stream) {
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
#if (__COMPUTE_CAPABILITY__ >= 200)
if (!isGhost) {
copyGaugeKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
} else {
copyGhostKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
}
#else
errorQuda("Gauge copy not supported on pre-Fermi architecture");
#endif
}

TuneKey tuneKey() const { return TuneKey(vol, typeid(*this).name(), aux); }
6 changes: 3 additions & 3 deletions lib/cuda_color_spinor_field.cu
@@ -604,6 +604,7 @@ namespace quda {
const int dagger, cudaStream_t *stream,
void *buffer, double a)
{
#ifdef MULTI_GPU
int face_num;
if(dir == QUDA_BACKWARDS){
face_num = 0;
@@ -612,7 +613,6 @@
}else{
face_num = 2;
}
#ifdef MULTI_GPU
void *packBuffer = buffer ? buffer : ghostFaceBuffer[bufferIndex];
packFace(packBuffer, *this, clov, clovInv, nFace, dagger, parity, dim, face_num, *stream, a);
#else
@@ -627,6 +627,7 @@ namespace quda {
const int dagger, cudaStream_t *stream,
void *buffer, double a, double b)
{
#ifdef MULTI_GPU
int face_num;
if(dir == QUDA_BACKWARDS){
face_num = 0;
@@ -635,7 +636,6 @@
}else{
face_num = 2;
}
#ifdef MULTI_GPU
void *packBuffer = buffer ? buffer : ghostFaceBuffer[bufferIndex];
packFace(packBuffer, *this, nFace, dagger, parity, dim, face_num, *stream, a, b);
#else
@@ -786,6 +786,7 @@ namespace quda {
const int dagger, cudaStream_t *stream,
void *buffer)
{
#ifdef MULTI_GPU
int face_num;
if(dir == QUDA_BACKWARDS){
face_num = 0;
@@ -794,7 +795,6 @@
}else{
face_num = 2;
}
#ifdef MULTI_GPU
void *packBuffer = buffer ? buffer : ghostFaceBuffer[bufferIndex];
packFaceExtended(packBuffer, *this, nFace, R, dagger, parity, dim, face_num, *stream);
#else
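The three hunks in this file all make the same change: the `#ifdef MULTI_GPU` is hoisted above the `face_num` computation, so single-GPU builds no longer compile a variable that is never used (a likely source of several of the warnings this commit removes). A toy sketch of the fix, with a `printf` standing in for the real `packFace` call:

#include <cstdio>

// Toy sketch (not the actual packGhost): hoisting the #ifdef removes the
// dead face_num computation from single-GPU builds entirely.
void packGhostToy(int dir) {
#ifdef MULTI_GPU
  // Previously this block sat outside the #ifdef: in single-GPU builds
  // face_num was computed, never used, and the compiler warned about it.
  int face_num;
  if (dir == 0)      face_num = 0;  // QUDA_BACKWARDS
  else if (dir == 1) face_num = 1;  // QUDA_FORWARDS
  else               face_num = 2;  // pack both faces
  printf("packing ghost face %d\n", face_num);
#else
  (void)dir;  // single-GPU build: no halo exchange, nothing to pack
#endif
}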
2 changes: 2 additions & 0 deletions lib/dslash_core/staggered_dslash_core.h
@@ -344,10 +344,12 @@ spinorFloat o02_im;

const int X1X0 = X[1]*X[0];
const int X2X1X0 = X[2]*X1X0;
#ifdef MULTI_GPU
#if (DD_IMPROVED == 1)
const int X3X1X0 = X[3]*X1X0;
#endif
const int half_volume = (X[0]*X[1]*X[2]*X[3] >> 1);
#endif

int za,zb;
int x0h;
22 changes: 10 additions & 12 deletions lib/dslash_core/staggered_fused_exterior_dslash_core.h
@@ -346,23 +346,18 @@ spinorFloat o02_im;

const int X1X0 = X[1]*X[0];
const int X2X1X0 = X[2]*X1X0;
#if (DD_IMPROVED == 1)
#if (DD_IMPROVED == 1 && defined(MULTI_GPU))
const int X3X1X0 = X[3]*X1X0;
#endif
const int half_volume = (X[0]*X[1]*X[2]*X[3] >> 1);

int za,zb;
int y[4] = {};

int full_idx=0;
#ifdef MULTI_GPU
int x0h;
int y[4];
int x0odd;
int full_idx;
int half_idx;


bool active = false;
int dim=4;
#ifdef MULTI_GPU
dim = dimFromFaceIndex (idx, param);
int dim = dimFromFaceIndex (idx, param);

if(dim == 0){
coordsFromFaceIndexStaggered<NFACE,2>(y, idx, param.parity, EXTERIOR_KERNEL_X, X);
@@ -374,9 +369,12 @@ spinorFloat o02_im;
coordsFromFaceIndexStaggered<NFACE,2>(y, idx, param.parity, EXTERIOR_KERNEL_T, X);
}

int za,zb;

const int half_volume = (X[0]*X[1]*X[2]*X[3] >> 1);
full_idx = ((y[3]*X[2] +y[2])*X[1] +y[1])*X[0]+y[0];
half_idx = full_idx>>1;
#endif // MULTI_GPU
int half_idx = full_idx>>1;



5 changes: 4 additions & 1 deletion lib/dslash_pack.cu
@@ -30,11 +30,14 @@ namespace quda {

using namespace pack;

#ifdef MULTI_GPU
static int commDim[QUDA_MAX_DIM]; // Whether to do comms or not
void setPackComms(const int *comm_dim) {
for (int i=0; i<QUDA_MAX_DIM; i++) commDim[i] = comm_dim[i];
}

#else
void setPackComms(const int *comm_dim) { ; }
#endif

#include <dslash_index.cuh>

2 changes: 1 addition & 1 deletion lib/dslash_policy.cuh
@@ -982,8 +982,8 @@ struct DslashFusedExterior : DslashPolicyImp {
dslashParam.kernel_type = INTERIOR_KERNEL;
dslashParam.threads = volume;

int scatterIndex = 0;
#ifdef MULTI_GPU
int scatterIndex = 0;
// Record the start of the dslash if doing communication in T and not kernel packing
if (dslashParam.commDim[3] && !(getKernelPackT() || getTwistPack()))
{
2 changes: 1 addition & 1 deletion lib/dslash_quda.cu
@@ -42,7 +42,7 @@
#endif
#endif // GPU_STAGGERED_DIRAC

#include <quda_internal.h>h
#include <quda_internal.h>
#include <dslash_quda.h>
#include <sys/time.h>
#include <blas_quda.h>
6 changes: 6 additions & 0 deletions lib/dslash_twisted_clover.cu
@@ -36,7 +36,9 @@ namespace quda {
//#define SHARED_WILSON_DSLASH
//#define SHARED_8_BYTE_WORD_SIZE // 8-byte shared memory access

#if (__COMPUTE_CAPABILITY__ >= 200)
#include <tmc_dslash_def.h> // Twisted Clover kernels
#endif

#ifndef DSLASH_SHARED_FLOATS_PER_THREAD
#define DSLASH_SHARED_FLOATS_PER_THREAD 0
@@ -123,6 +125,7 @@ namespace quda {
#endif
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#if (__COMPUTE_CAPABILITY__ >= 200)
switch(dslashType){

case QUDA_DEG_CLOVER_TWIST_INV_DSLASH:
@@ -142,6 +145,9 @@ namespace quda {
break;
default: errorQuda("Invalid twisted clover dslash type");
}
#else
errorQuda("Twisted-clover fermions not supported on pre-Fermi architecture");
#endif
}

long long flops() const { return (x ? 1416ll : 1392ll) * in->VolumeCB(); } // FIXME for multi-GPU
4 changes: 4 additions & 0 deletions lib/extract_gauge_ghost.cu
@@ -137,9 +137,13 @@ namespace quda {
virtual ~ExtractGhost() { ; }

void apply(const cudaStream_t &stream) {
#if (__COMPUTE_CAPABILITY__ >= 200)
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
extractGhostKernel<Float, length, nDim, Order>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#else
errorQuda("extractGhost not supported on pre-Fermi architecture");
#endif
}

TuneKey tuneKey() const { return TuneKey(vol, typeid(*this).name(), aux); }
9 changes: 9 additions & 0 deletions lib/extract_gauge_ghost_extended.cu
@@ -220,21 +220,30 @@ namespace quda {
if (location==QUDA_CPU_FIELD_LOCATION) {
extractGhostEx<Float,length,nDim,Order,true>(arg);
} else {
#if (__COMPUTE_CAPABILITY__ >= 200)
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
tp.grid.y = 2;
tp.grid.z = 2;
extractGhostExKernel<Float,length,nDim,Order,true>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#else
errorQuda("extractGhostEx not supported on pre-Fermi architecture");
#endif

}
} else { // we are injecting
if (location==QUDA_CPU_FIELD_LOCATION) {
extractGhostEx<Float,length,nDim,Order,false>(arg);
} else {
#if (__COMPUTE_CAPABILITY__ >= 200)
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
tp.grid.y = 2;
tp.grid.z = 2;
extractGhostExKernel<Float,length,nDim,Order,false>
<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#else
errorQuda("extractGhostEx not supported on pre-Fermi architecture");
#endif
}
}
}
12 changes: 8 additions & 4 deletions lib/ks_force_quda.cu
@@ -122,10 +122,6 @@ namespace quda {
{
int idx = threadIdx.x + blockIdx.x*blockDim.x;

if(idx==0){
printf("arg.threads = %d\n", arg.threads);
}

if(idx >= arg.threads) return;
completeKSForceCore<Float,Oprod,Gauge,Mom>(arg,idx);
}
@@ -168,10 +164,14 @@ namespace quda {

void apply(const cudaStream_t &stream) {
if(location == QUDA_CUDA_FIELD_LOCATION){
#if (__COMPUTE_CAPABILITY__ >= 200)
// Fix this
dim3 blockDim(128, 1, 1);
dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
completeKSForceKernel<Float><<<gridDim,blockDim>>>(arg);
#else
errorQuda("completeKSForce not supported on pre-Fermi architecture");
#endif
}else{
completeKSForceCPU<Float>(arg);
}
@@ -385,10 +385,14 @@ class KSLongLinkForce : Tunable {

void apply(const cudaStream_t &stream) {
if(location == QUDA_CUDA_FIELD_LOCATION){
#if (__COMPUTE_CAPABILITY__ >= 200)
// Fix this
dim3 blockDim(128, 1, 1);
dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
computeKSLongLinkForceKernel<Float><<<gridDim,blockDim>>>(arg);
#else
errorQuda("computeKSLongLinkForce not supported on pre-Fermi architecture");
#endif
}else{
computeKSLongLinkForceCPU<Float>(arg);
}
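Besides dropping the stray debug `printf` from the kernel and adding the pre-Fermi guards, the launch sites kept here size the grid with the usual ceiling division `(arg.threads + blockDim.x - 1) / blockDim.x`, so every site is covered even when the thread count is not a multiple of the block size. A quick standalone check of that arithmetic (the numbers are illustrative only):

#include <cstdio>
#include <initializer_list>

// Standalone check of the grid-sizing arithmetic used above: rounding up the
// block count guarantees gridDim.x * blockDim.x >= threads.
int main() {
  const int blockDimX = 128;
  for (int threads : {1, 128, 129, 1000}) {
    int gridDimX = (threads + blockDimX - 1) / blockDimX;  // ceiling division
    printf("threads=%4d -> %2d blocks, %4d thread slots\n",
           threads, gridDimX, gridDimX * blockDimX);
  }
  return 0;
}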
1 change: 1 addition & 0 deletions lib/misc_helpers.cu
@@ -540,6 +540,7 @@ namespace quda {

case 2:
switch(whichway){
case QUDA_BACKWARDS:
collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
break;
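The one-line addition here restores a missing `case QUDA_BACKWARDS:` label: in C++, statements placed inside a `switch` before its first `case` label are never executed, so without the label the two backward-direction `collectGhostStapleKernel` launches were dead code. A minimal reproduction of the pitfall:

#include <cstdio>

// Toy reproduction: without the "case BACKWARDS:" label, the first branch is
// unreachable, and most compilers only emit a warning about it.
enum Direction { BACKWARDS, FORWARDS };

void collect(Direction whichway) {
  switch (whichway) {
    case BACKWARDS:                       // the label this commit adds
      printf("collect backwards ghost staple\n");
      break;
    case FORWARDS:
      printf("collect forwards ghost staple\n");
      break;
  }
}

int main() {
  collect(BACKWARDS);
  collect(FORWARDS);
  return 0;
}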
2 changes: 2 additions & 0 deletions make.inc.in
@@ -106,6 +106,8 @@ NVCCOPT += -D__COMPUTE_CAPABILITY__=$(COMP_CAP)
TESLA_ARCH = $(shell [ $(COMP_CAP) -lt 200 ] && echo true)
ifneq ($(TESLA_ARCH),true)
NVCCOPT += -ftz=true -prec-div=false -prec-sqrt=false -w
else
NVCCOPT += -w
endif

ifeq ($(strip $(BUILD_MULTI_GPU)), yes)
